From 2e10f678a530d9aed6db6aa2a42ff56b5abbcd67 Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Fri, 7 Jul 2023 16:55:13 -0500
Subject: [PATCH 001/234] hipify the csrc repo

---
 csrc/kernels.hip           | 3865 ++++++++++++++++++++++++++++++++++++
 csrc/kernels_hip.cuh       |  132 ++
 csrc/ops.hip               |  850 ++++++++
 csrc/ops_hip.cuh           |  207 ++
 csrc/pythonInterface_hip.c |  375 ++++
 5 files changed, 5429 insertions(+)
 create mode 100644 csrc/kernels.hip
 create mode 100644 csrc/kernels_hip.cuh
 create mode 100644 csrc/ops.hip
 create mode 100644 csrc/ops_hip.cuh
 create mode 100644 csrc/pythonInterface_hip.c

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
new file mode 100644
index 000000000..4724f57c0
--- /dev/null
+++ b/csrc/kernels.hip
@@ -0,0 +1,3865 @@
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <kernels.cuh>
+#include <cub/block/block_radix_sort.cuh>
+#include <cub/warp/warp_reduce.cuh>
+#include <hipcub/hipcub.hpp>
+#include <cub/block/block_discontinuity.cuh>
+#include <cub/block/block_store.cuh>
+#include <hipcub/hipcub.hpp>
+#include <hipcub/hipcub.hpp>
+#include <math_constants.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <mma.h>
+
+
+#define HLF_MAX 65504
+#define TH 1024
+#define NUM 4
+#define NUM_BLOCK 4096
+
+
+// source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ float atomicMax(float* address, float val) {
+  int* address_as_i = reinterpret_cast<int*>(address);
+  int old = *address_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(
+        reinterpret_cast<int*>(address), assumed,
+        __float_as_int(fmaxf(val, __int_as_float(assumed))));
+  } while (assumed != old);
+  return __int_as_float(old);
+}
+
+__device__ float atomicMin(float* address, float val) {
+  int* address_as_i = reinterpret_cast<int*>(address);
+  int old = *address_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(
+        reinterpret_cast<int*>(address), assumed,
+        __float_as_int(fminf(val, __int_as_float(assumed))));
+  } while (assumed != old);
+  return __int_as_float(old);
+}
+
+__device__ float dDequantizeFP4(unsigned char val, float absmax)
+{
+  float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
+  if((val & 0b0110) == 0)
+  {
+    // subnormal
+    if((val & 0b0001) == 0)
+      return 0.0f;
+    else
+      return sign*0.0625f*absmax;
+  }
+  else
+  {
+    // normal
+    float exponent = ((val & 0b0100) == 4 ? 2.0f : 8.0f) + ((val & 0b0010) == 2 ? 0.0f : 2.0f);
+    float fraction = (val & 0b0001) == 1 ? 1.5f : 1.0f;
+
+    return sign*exponent*fraction*absmax;
+  }
+}
+
+__device__ float d2DequantizeFP4(unsigned char val)
+{
+  float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
+  if((val & 0b0110) == 0)
+  {
+    // subnormal
+    if((val & 0b0001) == 0)
+      return 0.0f;
+    else
+      return sign*0.0625f;
+  }
+  else
+  {
+    // normal
+    float exponent = ((val & 0b0100) == 4 ? 2.0f : 8.0f) + ((val & 0b0010) == 2 ? 0.0f : 2.0f);
+    float fraction = (val & 0b0001) == 1 ? 1.5f : 1.0f;
+
+    return sign*exponent*fraction;
+  }
+}
+
+__device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
+{
+  float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
+  if((val & 0b0100) == 4) // 0
+    if((val & 0b0010) == 2) //01
+      if((val & 0b0001) == 1) // 111
+        return 0.25000000f*absmax*sign; // 1111
+      else
+        return 0.16666667f*absmax*sign; // 1110
+    else
+      if((val & 0b0001) == 1) // 110
+        return 0.50000000f*absmax*sign; // 1101
+      else
+        return 0.33333333f*absmax*sign; // 1100
+  else
+    if((val & 0b0010) == 2) //10
+      if((val & 0b0001) == 1) // 101
+        return 1.00000000f*absmax*sign; // 1011
+      else
+        return 0.66666667f*absmax*sign; // 1010
+    else 
+      if((val & 0b0001) == 1) // 100
+        return 5.208333333e-03f*absmax*sign; // 1001
+      else
+        return 0.00000000f*absmax*sign; // 1000
+}
+
+__device__ unsigned char dQuantizeFP4(float x)
+{
+  // FP4 with bias of 3
+  // first bit is a sign
+  // subnormals
+  // 0b000 = 0
+  // 0b001 = 0.0625
+  // 0b110 = 2
+  // 0b111 = 3
+  // 0b100 = 4
+  // 0b101 = 6
+  // 0b010 = 8
+  // 0b011 = 12
+
+
+  // we do a binary search
+  // the pivots are divided by 12 (the FP4 absmax)
+  // since we assum input data is in [-1.0, 1.0]
+
+  // !be careful here, its easy to make a mistake
+  // that is difficult to noice if you add an extra
+  // zero somewhere!
+
+  int sign = x < 0 ? 0b1000 : 0b0000;
+  x = fabsf(x);
+  if(x > 0.29166667f)
+    if( x > 0.583333f)
+      if( x > 0.8333333f)
+        return 0b0011+sign;
+      else
+        return 0b0010+sign;
+    else
+      if(x > 0.4166667f)
+        return 0b101+sign;
+      else
+        return 0b100+sign;
+  else
+    if(x > 0.0859375f)
+      if(x > 0.20833333f)
+        return 0b0111+sign;
+      else
+        return 0b0110+sign;
+    else
+      if(x > 0.00260417f)
+        return 0b0001+sign;
+      else
+        return 0b0000+sign;
+}
+
+__device__ half dhDequantizeNF4(unsigned char val)
+{
+  // the values for this tree was generated by test_normal_map_tree
+  // in the file tests/test_functional.py
+  if((val & 0b1000) == 8)
+    if((val & 0b0100) == 4) // 1
+      if((val & 0b0010) == 2) // 11
+        if((val & 0b0001) == 1) // 111
+          return 1.0f; 
+        else
+          return 0.7229568362236023f;
+      else
+        if((val & 0b0001) == 1) // 110
+          return 0.5626170039176941f; 
+        else
+          return 0.44070982933044434f; 
+    else
+      if((val & 0b0010) == 2) //10
+        if((val & 0b0001) == 1) // 101
+          return 0.33791524171829224f; 
+        else
+          return 0.24611230194568634f; 
+      else 
+        if((val & 0b0001) == 1) // 100
+          return 0.16093020141124725f; 
+        else
+          return 0.07958029955625534f; 
+
+  else
+    if((val & 0b0100) == 4) // 0
+      if((val & 0b0010) == 2) //01
+        if((val & 0b0001) == 1) // 011
+          return 0.0f; 
+        else
+          return -0.09105003625154495f; 
+      else
+        if((val & 0b0001) == 1) // 010
+          return -0.18477343022823334f; 
+        else
+          return -0.28444138169288635f;
+    else
+      if((val & 0b0010) == 2) //00
+        if((val & 0b0001) == 1) // 001
+          return -0.39491748809814453f;
+        else
+          return -0.5250730514526367f; 
+      else 
+        if((val & 0b0001) == 1) // 000
+          return -0.6961928009986877f; 
+        else
+          return -1.0f; 
+
+}
+
+__device__ float dDequantizeNF4(unsigned char val)
+{
+  // the values for this tree was generated by test_normal_map_tree
+  // in the file tests/test_functional.py
+  if((val & 0b1000) == 8)
+    if((val & 0b0100) == 4) // 1
+      if((val & 0b0010) == 2) // 11
+        if((val & 0b0001) == 1) // 111
+          return 1.0f; 
+        else
+          return 0.7229568362236023f;
+      else
+        if((val & 0b0001) == 1) // 110
+          return 0.5626170039176941f; 
+        else
+          return 0.44070982933044434f; 
+    else
+      if((val & 0b0010) == 2) //10
+        if((val & 0b0001) == 1) // 101
+          return 0.33791524171829224f; 
+        else
+          return 0.24611230194568634f; 
+      else 
+        if((val & 0b0001) == 1) // 100
+          return 0.16093020141124725f; 
+        else
+          return 0.07958029955625534f; 
+
+  else
+    if((val & 0b0100) == 4) // 0
+      if((val & 0b0010) == 2) //01
+        if((val & 0b0001) == 1) // 011
+          return 0.0f; 
+        else
+          return -0.09105003625154495f; 
+      else
+        if((val & 0b0001) == 1) // 010
+          return -0.18477343022823334f; 
+        else
+          return -0.28444138169288635f;
+    else
+      if((val & 0b0010) == 2) //00
+        if((val & 0b0001) == 1) // 001
+          return -0.39491748809814453f;
+        else
+          return -0.5250730514526367f; 
+      else 
+        if((val & 0b0001) == 1) // 000
+          return -0.6961928009986877f; 
+        else
+          return -1.0f; 
+
+}
+
+__device__ unsigned char dQuantizeNF4(float x)
+{
+
+  // the values for this tree was generated by test_normal_map_tree
+  // in the file tests/test_functional.py
+  if(x > 0.03979014977812767f)
+    if(x > 0.3893125355243683f) // 1
+      if(x > 0.6427869200706482f) // 11
+        if(x > 0.8614784181118011f) // 111
+          return 0b1111;
+        else
+          return 0b1110;
+      else
+        if(x > 0.5016634166240692f) // 110
+          return 0b1101;
+        else
+          return 0b1100;
+    else
+      if(x > 0.2035212516784668f) // 10
+        if(x > 0.2920137718319893f) // 101
+          return 0b1011;
+        else
+          return 0b1010;
+      else
+        if(x > 0.1202552504837513f) // 100
+          return 0b1001;
+        else
+          return 0b1000;
+  else
+    if(x > -0.33967943489551544f) // 0
+      if(x > -0.13791173323988914f) // 01
+        if(x > -0.045525018125772476f) // 011
+          return 0b0111;
+        else
+          return 0b0110;
+      else
+        if(x > -0.23460740596055984f) // 010
+          return 0b0101;
+        else
+          return 0b0100;
+    else
+      if(x > -0.6106329262256622f) // 00
+        if(x > -0.4599952697753906f) // 001
+          return 0b0011;
+        else
+          return 0b0010;
+      else
+        if(x > -0.8480964004993439f) // 000
+          return 0b0001;
+        else
+          return 0b0000;
+}
+// sign function for lion
+// taken from https://stackoverflow.com/a/4609795, but not sure if there's a proper way to do this in CUDA
+
+template <typename T> __device__ int sgn(T val)
+{
+  return (T(0) < val) - (val < T(0));
+}
+
+template <int STOCHASTIC>
+__device__ unsigned char dQuantize(float* smem_code, const float rand, float x)
+{
+    int pivot = 127;
+    int upper_pivot = 255;
+    int lower_pivot = 0;
+
+    float lower = -1.0f;
+    float upper = 1.0f;
+
+    float val = smem_code[pivot];
+    // i>>=1 = {32, 16, 8, 4, 2, 1}
+    for(int i = 64; i > 0; i>>=1)
+    {
+        if(x > val)
+        {
+            lower_pivot = pivot;
+            lower = val;
+            pivot+=i;
+        }
+        else
+        {
+            upper_pivot = pivot;
+            upper = val;
+            pivot-=i;
+        }
+        val = smem_code[pivot];
+    }
+
+    if(upper_pivot == 255)
+        upper = smem_code[upper_pivot];
+    if(lower_pivot == 0)
+        lower = smem_code[lower_pivot];
+
+    if(!STOCHASTIC)
+    {
+      if(x > val)
+      {
+        float midpoint = (upper+val)*0.5f;
+        if(x > midpoint)
+        {
+          return upper_pivot;
+        }
+        else
+          return pivot;
+      }
+      else
+      {
+        float midpoint = (lower+val)*0.5f;
+        if(x < midpoint)
+          return lower_pivot;
+        else
+          return pivot;
+      }
+    }
+    else
+    {
+      if(x > val)
+      {
+        float dist_to_upper = fabsf(upper-x);
+        float dist_full = upper-val;
+        if(rand >= dist_to_upper/dist_full) return upper_pivot;
+        else return pivot;
+      }
+      else
+      {
+        float dist_to_lower = fabsf(lower-x);
+        float dist_full = val-lower;
+        if(rand >= dist_to_lower/dist_full) return lower_pivot;
+        else return pivot;
+      }
+    }
+}
+
+template <int SIGNED>
+__device__ __forceinline__ unsigned char quantize_2D(float *__restrict__ quadrants, float *__restrict__ const smem_code, float x)
+{
+    int pivot = 127;
+    int upper_pivot = 255;
+    int lower_pivot = 0;
+
+    float lower = SIGNED ? -1.0f : 0.0f;
+    float upper = 1.0f;
+    float midpoint;
+    float val = quadrants[1];
+    int local_pivot = 1;
+    int offset = 1;
+
+    // i>>=1 = {32, 16, 8, 4, 2, 1}
+    for(int i = 64; i > 0; i>>=1)
+    {
+        if(x > val)
+        {
+            lower_pivot = pivot;
+            lower = val;
+            pivot+=i;
+            //val = i == 64 ? quadrants[2] : smem_code[pivot];
+            local_pivot += offset;
+        }
+        else
+        {
+            upper_pivot = pivot;
+            upper = val;
+            pivot-=i;
+            //val = i == 64 ? quadrants[0] : smem_code[pivot];
+            local_pivot -= offset;
+        }
+        val = i >= 64 ? quadrants[local_pivot] : smem_code[pivot];
+        offset -= 1;
+    }
+
+    if(x > val)
+    {
+      midpoint = (upper+val)*0.5f;
+      if(x > midpoint)
+        return upper_pivot;
+      else
+        return pivot;
+    }
+    else
+    {
+      midpoint = (lower+val)*0.5f;
+      if(x < midpoint)
+        return lower_pivot;
+      else
+        return pivot;
+    }
+}
+
+template <int SIGNED>
+__device__ __forceinline__ unsigned char quantize_quadrant(int QUADRANT, float *__restrict__ const smem_code, float x, float lower, float midpoint, float upper)
+{
+    int lower_pivot = QUADRANT*16-1 - 0;
+    int pivot = QUADRANT*16-1 + 16;
+    int upper_pivot = QUADRANT*16-1 + 31;
+
+    float val = midpoint;
+
+    // i>>=1 = {32, 16, 8, 4, 2, 1}
+    for(int i = 16; i > 0; i>>=1)
+    {
+        if(x > val)
+        {
+            lower_pivot = pivot;
+            lower = val;
+            pivot+=i;
+        }
+        else
+        {
+            upper_pivot = pivot;
+            upper = val;
+            pivot-=i;
+        }
+        val = smem_code[pivot];
+    }
+
+    if(x > val)
+    {
+      midpoint = (upper+val)*0.5f;
+      if(x > midpoint)
+        return upper_pivot;
+      else
+        return pivot;
+    }
+    else
+    {
+      midpoint = (lower+val)*0.5f;
+      if(x < midpoint)
+        return lower_pivot;
+      else
+        return pivot;
+    }
+}
+
+__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n)
+{
+  const int tid = threadIdx.x + (blockDim.x*blockIdx.x);
+  const int numThreads = blockDim.x*gridDim.x;
+
+  for(int i = tid; i < n; i+=numThreads)
+  {
+      int idx = (index1[i]*maxidx1) + index2[i];
+      atomicAdd(&histogram[idx], src[i]);
+  }
+}
+
+template<typename T, int BLOCK_SIZE, int NUM_MAX>
+__global__ void kCompressMax(T * __restrict__ const A, T* out, unsigned char* out_idx, const int n)
+{
+  typedef hipcub::WarpReduce<T> WarpReduce;
+  __shared__ typename WarpReduce::TempStorage temp_storage;
+  typedef cub::BlockLoad<T, BLOCK_SIZE/8 , 8, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+  __shared__ typename LoadT::TempStorage loadt;
+
+  const int warp_idx = threadIdx.x/32;
+  const int valid_items = n - (blockIdx.x*BLOCK_SIZE) > BLOCK_SIZE ? BLOCK_SIZE : n - (blockIdx.x*BLOCK_SIZE);
+
+  //  BLOCK_SIZE/32 == number of warps
+  __shared__ int smem_max_indices[8*BLOCK_SIZE/32];
+  __shared__ float smem_max_values[8*BLOCK_SIZE/32];
+
+  T values[8];
+  T max1 = -64000.0f;
+  T max2 = -64000.0f;
+  int max_idx1 = -1;
+  int max_idx2 = -1;
+  int sign1 = -1;
+  int sign2 = -1;
+
+  // 1. load 8 values per thread
+  // 2. compute 2-max in registers (64 max per warp)
+  // 3. do warp reduction + broadcast back
+  // 4. Up-shift maxed value, write index into shared memory, replace with 2nd largest
+  // 5. Repeat (3) 8 times for top 8 values in 256
+  // 6. store with byte index
+
+  LoadT(loadt).Load(&(A[(blockIdx.x*BLOCK_SIZE)]), values, valid_items, (T)0.0f);
+  #pragma unroll 8
+  for(int i = 0; i < 8; i++)
+  {
+    T absval = fabsf(values[i]);
+    if(absval > max1)
+    {
+      max1 = values[i];
+      sign1 = signbit(values[i]);
+      max_idx1 = 8*threadIdx.x + i;
+    }
+    else if(absval > max2)
+    {
+      max2 = values[i];
+      sign2 = signbit(values[i]);
+      max_idx2 = 8*threadIdx.x + i;
+    }
+  }
+
+  float warp_max;
+  for(int i = 0; i < 8; i++)
+  {
+    // 3. do warp reduction + broadcast back
+    warp_max = WarpReduce(temp_storage).Reduce(max1, hipcub::Max());
+    warp_max = cub::ShuffleIndex<32>(warp_max, 0, 0xffffffff);
+
+    // 4. Up-shift maxed value, write index into shared memory, replace with 2nd largest
+    if(warp_max == max1)
+    {
+      smem_max_values[warp_idx*8 + i] = sign1 != 0 ? -max1 : max1;
+      smem_max_indices[warp_idx*8 + i] = max_idx1;
+
+      sign1 = sign2;
+      max1 = max2;
+      max_idx1 = max_idx2;
+
+      max2 = -64000.0f;
+    }
+    __syncwarp();
+  }
+
+  if(threadIdx.x % 32 < 8)
+  {
+    // offset: 8 values per 256 input values
+    //
+    int offset = BLOCK_SIZE*blockIdx.x*BLOCK_SIZE/32*8;
+  }
+
+}
+
+#define THREADS_ESTIMATE 512
+#define NUM_ESTIMATE 8
+#define BLOCK_ESTIMATE 4096
+
+template<typename T>
+__launch_bounds__(THREADS_ESTIMATE, 1)
+__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n)
+{
+  const int n_full = (BLOCK_ESTIMATE*(n/BLOCK_ESTIMATE)) + (n % BLOCK_ESTIMATE == 0 ? 0 : BLOCK_ESTIMATE);
+  int valid_items = (blockIdx.x+1 == gridDim.x) ? n - (blockIdx.x*BLOCK_ESTIMATE) : BLOCK_ESTIMATE;
+  const int base_idx = (blockIdx.x * BLOCK_ESTIMATE);
+  const float reciprocal_num_blocks = 1.0f/(n < 4096 ? 1.0f : (n/BLOCK_ESTIMATE));
+
+  T vals[NUM_ESTIMATE];
+
+  typedef cub::BlockRadixSort<T, THREADS_ESTIMATE, NUM_ESTIMATE, cub::NullType, 4, true, cub::BLOCK_SCAN_RAKING> BlockRadixSort;
+  typedef cub::BlockLoad<T, THREADS_ESTIMATE, NUM_ESTIMATE, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+
+  __shared__ union {
+      typename LoadFloat::TempStorage loadf;
+      typename BlockRadixSort::TempStorage sort;
+      int smem_qidx[BLOCK_ESTIMATE];
+  } temp_storage;
+
+  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_ESTIMATE)
+  {
+      valid_items = n - i > BLOCK_ESTIMATE ? BLOCK_ESTIMATE : n - i;
+
+      // do not process half-blocks
+      if(valid_items < BLOCK_ESTIMATE && n > BLOCK_ESTIMATE){ continue; }
+
+      #pragma unroll 4
+      for(int j = 0; j < NUM_ESTIMATE; j++)
+          vals[j] = max_val;
+
+      __syncthreads();
+      LoadFloat(temp_storage.loadf).Load(&(A[i]), vals, valid_items);
+
+      #pragma unroll 4
+      for(int j = 0; j < NUM_ESTIMATE; j++)
+          vals[j] = ((float)vals[j]) * reciprocal_num_blocks;
+
+
+      __syncthreads();
+      // sort into striped pattern to mitigate bank conflicts
+      // striped pattern index for thread 0 [0, 1024, 2048, 3096]
+      // striped pattern index for thread 1 [1, 1025, 2049, 3097]
+      BlockRadixSort(temp_storage.sort).SortBlockedToStriped(vals);
+
+      __syncthreads();
+      for(int j = threadIdx.x; j < BLOCK_ESTIMATE; j+=blockDim.x)
+          temp_storage.smem_qidx[j] = -1;
+
+      if(threadIdx.x < 256)
+      {
+          float q_interval = (1.0f-(2.0f*offset))/255.0f;
+          int local_idx = round(((offset+(threadIdx.x*q_interval))*(valid_items-1)));
+          temp_storage.smem_qidx[local_idx] = threadIdx.x;
+      }
+
+      __syncthreads();
+
+      for(int i = threadIdx.x; i < BLOCK_ESTIMATE; i+=blockDim.x)
+      {
+          if(temp_storage.smem_qidx[i] != -1)
+              atomicAdd(&code[temp_storage.smem_qidx[i]], vals[i/THREADS_ESTIMATE]);
+      }
+  }
+}
+
+
+__launch_bounds__(TH, 4)
+__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n)
+{
+  const int n_full = (NUM_BLOCK*(n/NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK);
+  int valid_items = (blockIdx.x+1 == gridDim.x) ? n - (blockIdx.x*NUM_BLOCK) : NUM_BLOCK;
+  const int base_idx = (blockIdx.x * NUM_BLOCK);
+
+  float vals[NUM];
+  unsigned char qvals[NUM];
+  //const int lane_id = threadIdx.x % 2;
+
+  typedef cub::BlockLoad<float, TH, NUM, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef cub::BlockStore<unsigned char, TH, NUM, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+
+  __shared__ typename LoadFloat::TempStorage loadf;
+  __shared__ typename StoreChar::TempStorage storec;
+  __shared__ float smem_code[256];
+  //__shared__ float smem_code[2][257];
+
+  if(threadIdx.x < 256)
+  {
+    smem_code[threadIdx.x] = code[threadIdx.x];
+    //smem_code[0][threadIdx.x] = code[threadIdx.x];
+    //smem_code[1][threadIdx.x] = smem_code[0][threadIdx.x];
+  }
+
+
+  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*NUM_BLOCK)
+  {
+      // number of values already processed in blocks +
+      // number of values already processed in this block +
+      // rand_offset % mod value
+      valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i;
+
+      __syncthreads();
+      LoadFloat(loadf).Load(&(A[i]), vals, valid_items);
+
+
+      #pragma unroll 4
+      for(int j = 0; j < NUM; j++)
+          qvals[j] = dQuantize<0>(smem_code, 0.0f, vals[j]);
+
+      __syncthreads();
+      StoreChar(storec).Store(&(out[i]), qvals, valid_items);
+  }
+}
+
+template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE>
+//__launch_bounds__(TH, 4)
+__global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n)
+{
+  const int n_full = gridDim.x * BLOCK_SIZE;
+  int valid_items = 0;
+  const int base_idx = (blockIdx.x * BLOCK_SIZE);
+
+  T vals[NUM_PER_TH];
+  float rand_vals[NUM_PER_TH];
+  unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH];
+  //float local_abs_max = -FLT_MAX;
+  float local_abs_max = 0.0f;
+  int local_rand_idx = 0;
+
+  typedef cub::BlockLoad<T, BLOCK_SIZE/NUM_PER_TH, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+  typedef cub::BlockStore<unsigned char, BLOCK_SIZE/NUM_PER_TH, (DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+  typedef hipcub::BlockReduce<float, BLOCK_SIZE/NUM_PER_TH> BlockReduce;
+  typedef cub::BlockLoad<float, BLOCK_SIZE/NUM_PER_TH, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+
+  __shared__ typename LoadT::TempStorage loadt;
+  __shared__ typename LoadFloat::TempStorage loadf;
+  __shared__ typename StoreChar::TempStorage storec;
+  __shared__ typename BlockReduce::TempStorage reduce;
+  __shared__ float smem_code[256];
+  __shared__ float smem_absmax_value[1];
+
+  if(DATA_TYPE == General8bit)
+    for(int i = threadIdx.x; i < 256; i+=blockDim.x)
+      smem_code[i] = code[i];
+
+  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
+  {
+    valid_items = n - i > BLOCK_SIZE ? BLOCK_SIZE : n - i;
+    local_abs_max = -FLT_MAX;
+
+    __syncthreads();
+    LoadT(loadt).Load(&(A[i]), vals, valid_items, (T)0.0f);
+
+    // 1. compute local max
+    // 2. broadcast local max
+    // 3. normalize inputs and quantize
+
+    #pragma unroll NUM_PER_TH
+    for(int j = 0; j < NUM_PER_TH; j++)
+       local_abs_max = fmaxf(local_abs_max, fabsf((float)vals[j]));
+
+    local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, hipcub::Max(), valid_items);
+
+    if(threadIdx.x == 0)
+      smem_absmax_value[0] = local_abs_max;
+
+    __syncthreads();
+
+    if(threadIdx.x == 0)
+      absmax[i/BLOCK_SIZE] = local_abs_max;
+    else
+      local_abs_max = smem_absmax_value[0];
+
+    __syncwarp();
+
+    local_abs_max = 1.0f/local_abs_max;
+
+    if(STOCHASTIC)
+    {
+      local_rand_idx = ((blockIdx.x*NUM_BLOCK) + (threadIdx.x*NUM) + rand_offset) % (1024-4);
+      LoadFloat(loadf).Load(&rand[local_rand_idx], rand_vals, BLOCK_SIZE, 0);
+    }
+
+    unsigned char packed_4bit = 0;
+    switch(DATA_TYPE)
+    {
+        case General8bit:
+            #pragma unroll NUM_PER_TH
+            for(int j = 0; j < NUM_PER_TH; j++)
+            {
+                if(!STOCHASTIC)
+                 qvals[j] = dQuantize<0>(smem_code, 0.0f, ((float)vals[j])*local_abs_max);
+                else
+                 qvals[j] = dQuantize<1>(smem_code, rand_vals[j], ((float)vals[j])*local_abs_max);
+            }
+            break;
+        case FP4:
+            #pragma unroll NUM_PER_TH
+            for(int j = 0; j < NUM_PER_TH/2; j++)
+            {
+              packed_4bit |= dQuantizeFP4(((float)vals[2*j])*local_abs_max) << 4;
+              packed_4bit |= dQuantizeFP4(((float)vals[2*j+1])*local_abs_max);
+              qvals[j] = packed_4bit;
+            }
+            break;
+        case NF4:
+            #pragma unroll NUM_PER_TH
+            for(int j = 0; j < NUM_PER_TH/2; j++)
+            {
+              packed_4bit |= dQuantizeNF4(((float)vals[2*j])*local_abs_max) << 4;
+              packed_4bit |= dQuantizeNF4(((float)vals[2*j+1])*local_abs_max);
+              qvals[j] = packed_4bit;
+            }
+            break;
+    }
+
+    __syncthreads();
+    StoreChar(storec).Store(&(out[(DATA_TYPE > 0) ? i/2 : i]), qvals, (DATA_TYPE > 0) ? (valid_items+1)/2 : valid_items);
+  }
+}
+
+template<typename T, int TILE_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE>
+__global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n)
+{
+
+  const int n_load = (gridDim.x * TILE_SIZE);
+  int valid_items_load = 0;
+  int valid_items_store = 0;
+  const int base_idx = (blockIdx.x * TILE_SIZE);
+
+  T vals[NUM_PER_TH*((DATA_TYPE > 0) ? 2 : 1)];
+  unsigned char qvals[NUM_PER_TH];
+  float local_abs_max = -FLT_MAX;
+
+  typedef cub::BlockLoad<unsigned char, THREADS, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+  typedef cub::BlockStore<T, THREADS, NUM_PER_TH*((DATA_TYPE > 0) ? 2 : 1), cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+
+  __shared__ typename LoadChar::TempStorage loadchar;
+  __shared__ typename StoreT::TempStorage storet;
+
+  for (unsigned int i = base_idx; i < n_load; i += gridDim.x*TILE_SIZE)
+  {
+    if(DATA_TYPE > 0)
+    {
+      valid_items_load = (n+1)/2 - i > TILE_SIZE ? TILE_SIZE : (n+1)/2 - i;
+      valid_items_store = n - i*2 > TILE_SIZE*2 ? TILE_SIZE*2 : n - i*2;
+    }
+    else
+    {
+      valid_items_load = n - i > TILE_SIZE ? TILE_SIZE : n - i;
+      valid_items_store = n - i > TILE_SIZE ? TILE_SIZE : n - i;
+    }
+    local_abs_max = __ldg(&absmax[(i+threadIdx.x*NUM_PER_TH)/(blocksize)]);
+
+    __syncthreads();
+    LoadChar(loadchar).Load(&(A[i]), qvals, valid_items_load, 128);
+
+    switch(DATA_TYPE)
+    {
+        case General8bit:
+          // load code through read-only cache via __ldg
+          #pragma unroll NUM_PER_TH
+          for(int j = 0; j < NUM_PER_TH; j++)
+            vals[j] = __ldg(&code[qvals[j]])*local_abs_max;
+          break;
+        case FP4:
+          #pragma unroll NUM_PER_TH
+          for(int j = 0; j < NUM_PER_TH; j++)
+          {
+            vals[j*2] = dDequantizeFP4Tree(qvals[j] >> 4, local_abs_max);
+            vals[j*2 + 1] = dDequantizeFP4Tree(qvals[j] & 0x0F, local_abs_max);
+          }
+          break;
+        case NF4:
+          #pragma unroll NUM_PER_TH
+          for(int j = 0; j < NUM_PER_TH; j++)
+          {
+            vals[j*2] = dDequantizeNF4(qvals[j] >> 4)* local_abs_max;
+            vals[j*2 + 1] = dDequantizeNF4(qvals[j] & 0x0F)* local_abs_max;
+          }
+          break;
+    }
+
+    __syncthreads();
+    StoreT(storet).Store(&(out[(DATA_TYPE > 0) ? i*2 : i]), vals, valid_items_store);
+  }
+}
+
+__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n)
+{
+	const unsigned int numThreads = blockDim.x * gridDim.x;
+	const int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	__shared__ float smem_code[256];
+	if(threadIdx.x < 256)
+	{
+		smem_code[threadIdx.x] = code[threadIdx.x];
+	}
+
+	__syncthreads();
+
+	for (int i = idx;i < n; i += numThreads)
+	{
+		out[i] = smem_code[A[i]];
+	}
+}
+
+
+
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
+__launch_bounds__(BLOCK_SIZE/NUM_VALS, 1)
+__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
+                float* state1, float* state2, float *unorm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const int n)
+{
+
+  const int n_full = (BLOCK_SIZE*(n/BLOCK_SIZE)) + (n % BLOCK_SIZE == 0 ? 0 : BLOCK_SIZE);
+  const int base_idx = (blockIdx.x * blockDim.x * NUM_VALS);
+  int valid_items = 0;
+
+  T g_vals[NUM_VALS];
+
+  float s1_vals[NUM_VALS];
+  float s2_vals[NUM_VALS];
+
+  const float correction1 = 1.0f/(1.0f - powf(beta1, step));
+  const float correction2 = 1.0f/(1.0f - powf(beta2, step));
+
+  typedef cub::BlockLoad<T, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
+  typedef cub::BlockLoad<float, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef hipcub::BlockReduce<float, BLOCK_SIZE/NUM_VALS> BlockReduce;
+
+  __shared__ union {
+      typename Load::TempStorage load;
+      typename LoadFloat::TempStorage loadf;
+      typename BlockReduce::TempStorage reduce;
+  } temp_storage;
+
+  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
+  {
+      valid_items = n - i >= (BLOCK_SIZE) ? (BLOCK_SIZE) : n - i;
+
+      __syncthreads();
+      Load(temp_storage.load).Load(&(g[i]), g_vals, valid_items, 0.0f);
+      __syncthreads();
+      LoadFloat(temp_storage.loadf).Load(&(state1[i]), s1_vals, valid_items, 0.0f);
+      __syncthreads();
+      LoadFloat(temp_storage.loadf).Load(&(state2[i]), s2_vals, valid_items, 0.0f);
+
+      # pragma unroll NUM_VALS
+      for(unsigned int j = 0; j < NUM_VALS; j++)
+        g_vals[j] = gnorm_scale*((float)g_vals[j]);
+
+      # pragma unroll NUM_VALS
+      for(unsigned int j = 0; j < NUM_VALS; j++)
+      {
+          switch(OPTIMIZER)
+          {
+              case ADAM:
+                  s1_vals[j] = s1_vals[j]*beta1 + ((1.0f -beta1)*((float)g_vals[j]));
+                  s2_vals[j] = s2_vals[j]*beta2 + ((1.0f -beta2)*(((float)g_vals[j])*((float)g_vals[j])));
+                  s1_vals[j] *= correction1;
+                  s2_vals[j] *= correction2;
+                  s1_vals[j] = s1_vals[j]/(sqrtf(s2_vals[j])+eps); // update
+                  s1_vals[j] *= s1_vals[j]; // update l2 norm (update*update)
+                  break;
+          }
+      }
+
+      # pragma unroll NUM_VALS-1
+      for(unsigned int j = 1; j < NUM_VALS; j++)
+          s1_vals[0] += s1_vals[j];
+
+      __syncthreads();
+      s1_vals[0] = BlockReduce(temp_storage.reduce).Sum(s1_vals[0]);
+
+      if(threadIdx.x == 0)
+        atomicAdd(&unorm[0], s1_vals[0]);
+
+      __syncwarp();
+  }
+}
+
+
+
+#define NUM_PER_THREAD 4
+
+template<typename T, int OPTIMIZER>
+__launch_bounds__(TH, 1)
+__global__ void kOptimizer32bit2State(T* g, T* p,
+                float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n)
+{
+
+  const int n_full = ((TH*NUM_PER_THREAD)*(n/(TH*NUM_PER_THREAD))) + (n % (TH*NUM_PER_THREAD) == 0 ? 0 : (TH*NUM_PER_THREAD));
+  const int base_idx = (blockIdx.x * blockDim.x * NUM_PER_THREAD);
+  int valid_items = 0;
+  float update_scale = 0.0f;
+  T g_vals[NUM_PER_THREAD];
+  T p_vals[NUM_PER_THREAD];
+
+  float s1_vals[NUM_PER_THREAD];
+  float s2_vals[NUM_PER_THREAD];
+
+  const float correction1 = 1.0f - powf(beta1, step);
+  const float correction2 = sqrtf(1.0f - powf(beta2, step));
+  const float step_size = -lr*correction2/correction1;
+
+  if(max_unorm > 0.0f)
+  {
+    update_scale = max_unorm > 0.0f ? sqrtf(unorm[0]) : 1.0f;
+    if(update_scale > max_unorm*param_norm){ update_scale = (max_unorm*param_norm)/update_scale; }
+    else{ update_scale = 1.0f; }
+  }
+  else{ update_scale = 1.0f; }
+
+  typedef cub::BlockLoad<T, TH, NUM_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
+  typedef cub::BlockStore<T, TH, NUM_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE> Store;
+
+  typedef cub::BlockLoad<float, TH, NUM_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef cub::BlockStore<float, TH, NUM_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreFloat;
+
+  __shared__ union {
+      typename Load::TempStorage load;
+      typename Store::TempStorage store;
+      typename LoadFloat::TempStorage loadf;
+      typename StoreFloat::TempStorage storef;
+  } temp_storage;
+
+  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*TH*NUM_PER_THREAD)
+  {
+      valid_items = n - i >= (TH*NUM_PER_THREAD) ? (TH*NUM_PER_THREAD) : n - i;
+
+      __syncthreads();
+      Load(temp_storage.load).Load(&(g[i]), g_vals, valid_items);
+      __syncthreads();
+      LoadFloat(temp_storage.loadf).Load(&(state1[i]), s1_vals, valid_items);
+      __syncthreads();
+      LoadFloat(temp_storage.loadf).Load(&(state2[i]), s2_vals, valid_items);
+      __syncthreads();
+      Load(temp_storage.load).Load(&(p[i]), p_vals, valid_items);
+
+      # pragma unroll 4
+      for(unsigned int j = 0; j < NUM_PER_THREAD; j++)
+        g_vals[j] = gnorm_scale*((float)g_vals[j]);
+
+      # pragma unroll 4
+      for(unsigned int j = 0; j < NUM_PER_THREAD; j++)
+      {
+          switch(OPTIMIZER)
+          {
+              case ADAM:
+									if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
+									{
+										s1_vals[j] = s1_vals[j]*beta1 + ((1.0f -beta1)*((float)g_vals[j]));
+										s2_vals[j] = s2_vals[j]*beta2 + ((1.0f -beta2)*(((float)g_vals[j])*((float)g_vals[j])));
+										p_vals[j] = ((float)p_vals[j]) + (update_scale*step_size*(s1_vals[j]/(sqrtf(s2_vals[j])+(eps*correction2))));
+
+                    if(weight_decay > 0.0f)
+                        p_vals[j] = ((float)p_vals[j])*(1.0f-(lr*weight_decay));
+									}
+                  break;
+          }
+      }
+
+      __syncthreads();
+      Store(temp_storage.store).Store(&(p[i]), p_vals, valid_items);
+      __syncthreads();
+      StoreFloat(temp_storage.storef).Store(&(state1[i]), s1_vals, valid_items);
+      __syncthreads();
+      StoreFloat(temp_storage.storef).Store(&(state2[i]), s2_vals, valid_items);
+  }
+}
+
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
+__launch_bounds__(BLOCK_SIZE/NUM_VALS, 1)
+__global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
+                float* state1, float *unorm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const int n)
+{
+
+  const int n_full = (BLOCK_SIZE*(n/BLOCK_SIZE)) + (n % BLOCK_SIZE == 0 ? 0 : BLOCK_SIZE);
+  const int base_idx = (blockIdx.x * blockDim.x * NUM_VALS);
+  int valid_items = 0;
+
+  T g_vals[NUM_VALS];
+
+  float s1_vals[NUM_VALS];
+
+  typedef cub::BlockLoad<T, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
+  typedef cub::BlockLoad<float, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef hipcub::BlockReduce<float, BLOCK_SIZE/NUM_VALS> BlockReduce;
+
+  __shared__ union {
+      typename Load::TempStorage load;
+      typename LoadFloat::TempStorage loadf;
+      typename BlockReduce::TempStorage reduce;
+  } temp_storage;
+
+  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
+  {
+      valid_items = n - i >= (BLOCK_SIZE) ? (BLOCK_SIZE) : n - i;
+
+      __syncthreads();
+      Load(temp_storage.load).Load(&(g[i]), g_vals, valid_items, 0.0f);
+      __syncthreads();
+      LoadFloat(temp_storage.loadf).Load(&(state1[i]), s1_vals, valid_items, 0.0f);
+
+      # pragma unroll NUM_VALS
+      for(unsigned int j = 0; j < NUM_VALS; j++)
+        g_vals[j] = gnorm_scale*((float)g_vals[j]);
+
+      # pragma unroll NUM_VALS
+      for(unsigned int j = 0; j < NUM_VALS; j++)
+      {
+          switch(OPTIMIZER)
+          {
+              case MOMENTUM:
+                  if(step == 1)
+                    s1_vals[j] = (float)g_vals[j]; // state update
+                  else
+                    s1_vals[j] = s1_vals[j]*beta1 + ((float)g_vals[j]); // state update
+                  s1_vals[j] = s1_vals[j]*s1_vals[j]; // update norm
+                  break;
+              case LION:
+                  s1_vals[j] = s1_vals[j]*beta2 + ((1.0f-beta2)*(float)g_vals[j]); // state update
+                  break;
+              case RMSPROP:
+                  s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*((float)g_vals[j])*((float)g_vals[j])); // state update
+                  s1_vals[j] = __fdividef((float)g_vals[j],sqrtf(s1_vals[j])+eps); // update value
+                  s1_vals[j] = s1_vals[j]*s1_vals[j]; // update norm
+                  break;
+              case ADAGRAD:
+                  s1_vals[j] = s1_vals[j] + ((float)g_vals[j])*((float)g_vals[j]); // state update
+                  s1_vals[j] = __fdividef((float)g_vals[j],sqrtf(s1_vals[j])+eps); // update value
+                  s1_vals[j] = s1_vals[j]*s1_vals[j]; // update norm
+                  break;
+          }
+      }
+
+      # pragma unroll
+      for(unsigned int j = 1; j < NUM_VALS; j++)
+        s1_vals[0] += s1_vals[j];
+
+      __syncthreads();
+      s1_vals[0] = BlockReduce(temp_storage.reduce).Sum(s1_vals[0], valid_items);
+
+      if(threadIdx.x == 0)
+        atomicAdd(&unorm[0], s1_vals[0]);
+
+      __syncwarp();
+  }
+}
+
+template<typename T, int OPTIMIZER>
+__launch_bounds__(TH, 1)
+__global__ void kOptimizer32bit1State(T *g, T *p,
+                float *state1, float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n)
+{
+
+  const int n_full = ((TH*NUM_PER_THREAD)*(n/(TH*NUM_PER_THREAD))) + (n % (TH*NUM_PER_THREAD) == 0 ? 0 : (TH*NUM_PER_THREAD));
+  const int base_idx = (blockIdx.x * blockDim.x * NUM_PER_THREAD);
+  int valid_items = 0;
+  float update_scale = 0.0f;
+
+  if(max_unorm > 0.0f)
+  {
+    update_scale = max_unorm > 0.0f ? sqrtf(unorm[0]) : 1.0f;
+    if(update_scale > max_unorm*param_norm+eps){ update_scale = (max_unorm*param_norm+eps)/update_scale; }
+    else{ update_scale = 1.0f; }
+  }
+  else{ update_scale = 1.0f; }
+
+  T g_vals[NUM_PER_THREAD];
+  T p_vals[NUM_PER_THREAD];
+
+  float s1_vals[NUM_PER_THREAD];
+
+  typedef cub::BlockLoad<T, TH, NUM_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
+  typedef cub::BlockStore<T, TH, NUM_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE> Store;
+
+  typedef cub::BlockLoad<float, TH, NUM_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef cub::BlockStore<float, TH, NUM_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreFloat;
+
+  __shared__ union {
+      typename Load::TempStorage load;
+      typename Store::TempStorage store;
+      typename LoadFloat::TempStorage loadf;
+      typename StoreFloat::TempStorage storef;
+  } temp_storage;
+
+  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*TH*NUM_PER_THREAD)
+  {
+      valid_items = n - i >= (TH*NUM_PER_THREAD) ? (TH*NUM_PER_THREAD) : n - i;
+
+      __syncthreads();
+      Load(temp_storage.load).Load(&(g[i]), g_vals, valid_items);
+      __syncthreads();
+      LoadFloat(temp_storage.loadf).Load(&(state1[i]), s1_vals, valid_items);
+      __syncthreads();
+      Load(temp_storage.load).Load(&(p[i]), p_vals, valid_items);
+
+      # pragma unroll 4
+      for(unsigned int j = 0; j < NUM_PER_THREAD; j++)
+      {
+        g_vals[j] = gnorm_scale*((float)g_vals[j]);
+        if(weight_decay > 0.0f)
+          g_vals[j] = (float)g_vals[j] + (((float)p_vals[j])*weight_decay);
+      }
+
+      # pragma unroll 4
+      for(unsigned int j = 0; j < NUM_PER_THREAD; j++)
+      {
+					if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
+					{
+						switch(OPTIMIZER)
+						{
+								case MOMENTUM:
+										if(step == 1)
+											s1_vals[j] = (float)g_vals[j];
+										else
+											s1_vals[j] = s1_vals[j]*beta1 + ((float)g_vals[j]);
+
+										p_vals[j] = ((float)p_vals[j]) + update_scale*(-lr*(s1_vals[j]));
+										break;
+								case LION:
+										p_vals[j] = ((float)p_vals[j]) - update_scale*(lr*sgn(((float)s1_vals[j])*beta1 + ((1.0f-beta1)*((float)g_vals[j]))));
+										s1_vals[j] = s1_vals[j]*beta2 + ((1.0f-beta2)*((float)g_vals[j]));
+										break;
+								case RMSPROP:
+										s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*((float)g_vals[j])*((float)g_vals[j]));
+										p_vals[j] = ((float)p_vals[j]) - update_scale*(lr*__fdividef((float)g_vals[j],sqrtf((float)s1_vals[j])+eps));
+										break;
+								case ADAGRAD:
+										s1_vals[j] = s1_vals[j] + ((float)g_vals[j])*((float)g_vals[j]);
+										p_vals[j] = ((float)p_vals[j]) - lr*__fdividef((float)g_vals[j],sqrtf((float)s1_vals[j])+eps);
+										break;
+						}
+					}
+      }
+
+      __syncthreads();
+      Store(temp_storage.store).Store(&(p[i]), p_vals, valid_items);
+      __syncthreads();
+      StoreFloat(temp_storage.storef).Store(&(state1[i]), s1_vals, valid_items);
+  }
+}
+
+
+#define NUM8BIT 16
+#define NUM_THREADS 256
+#define NUM_PER_BLOCK 4096
+
+template<typename T, int OPTIMIZER>
+__global__ void
+__launch_bounds__(NUM_THREADS, 2)
+kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, unsigned char* __restrict__ const state2,
+                float *unorm,
+                const float beta1, const float beta2,
+                const float eps, const int step,
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
+                float* max1, float* max2, float* new_max1, float* new_max2,
+                const float gnorm_scale, const int n)
+{
+    const int n_full = gridDim.x * NUM_PER_BLOCK;
+    const int base_idx = (blockIdx.x * blockDim.x * NUM_PER_THREAD);
+    int valid_items = n - (blockIdx.x*NUM_PER_BLOCK) > NUM_PER_BLOCK ? NUM_PER_BLOCK : n - (blockIdx.x*NUM_PER_BLOCK);
+    float g_val = 0.0f;
+    float local_max_s1 = -FLT_MAX;
+    float local_max_s2 = -FLT_MAX;
+    float local_unorm = 0.0f;
+
+    float s2_vals[NUM8BIT];
+    float s1_vals[NUM8BIT];
+    T g_vals[NUM8BIT];
+    unsigned char m_c1[NUM8BIT];
+    unsigned char r_c2[NUM8BIT];
+
+    typedef cub::BlockLoad<T, NUM_THREADS, NUM8BIT, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef cub::BlockLoad<unsigned char, NUM_THREADS, NUM8BIT, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadUInt8;
+    typedef hipcub::BlockReduce<float, NUM_THREADS> BlockReduce;
+
+
+    __shared__ union {
+        typename LoadT::TempStorage loadh;
+        typename LoadUInt8::TempStorage loadc;
+        typename BlockReduce::TempStorage reduce;
+    } temp_storage;
+
+    __shared__ float smem_quantiles1[256];
+    __shared__ float smem_quantiles2[256];
+
+    if(threadIdx.x < 256)
+    {
+        smem_quantiles1[threadIdx.x] = quantiles1[threadIdx.x];
+        smem_quantiles2[threadIdx.x] = quantiles2[threadIdx.x];
+    }
+
+    __syncthreads();
+
+    for (unsigned int i = base_idx; i < n_full; i += NUM_THREADS*gridDim.x*NUM8BIT)
+    {
+        valid_items = n - i >= (TH*NUM_PER_THREAD) ? (TH*NUM_PER_THREAD) : n - i;
+
+        LoadT(temp_storage.loadh).Load(&(g[i]), g_vals, valid_items, (T)0.0f);
+        __syncthreads();
+        LoadUInt8(temp_storage.loadc).Load(&(state1[i]), m_c1, valid_items, 128);
+        __syncthreads();
+        LoadUInt8(temp_storage.loadc).Load(&(state2[i]), r_c2, valid_items, 128);
+        __syncthreads();
+
+        #pragma unroll 16
+        for(int j = 0; j < NUM8BIT; j++)
+        {
+            g_val = g_vals[j];
+            g_val *= gnorm_scale;
+            s1_vals[j] = smem_quantiles1[m_c1[j]]*max1[0]*beta1;
+            s1_vals[j] += (1.0f-beta1)*g_val;
+            local_max_s1 = fmaxf(local_max_s1, fabsf(s1_vals[j]));
+        }
+
+        #pragma unroll 16
+        for(int j = 0; j < NUM8BIT; j++)
+        {
+            g_val = g_vals[j];
+            g_val *= gnorm_scale;
+            s2_vals[j] = smem_quantiles2[r_c2[j]]*max2[0]*beta2;
+            s2_vals[j] += (1.0f-beta2)*g_val*g_val;
+            local_max_s2 = fmaxf(local_max_s2, fabsf(s2_vals[j]));
+        }
+
+        if(unorm != NULL)
+        {
+          #pragma unroll 16
+          for(int j = 0; j < NUM8BIT; j++)
+          {
+            float correction1 = __fdividef(1.0f, 1.0f - powf(beta1, step));
+            float correction2 = __fdividef(1.0f, 1.0f - powf(beta2, step));
+            s1_vals[j] *= correction1;
+            s2_vals[j] *= correction2;
+            float update_val = s1_vals[j]/(sqrtf(s2_vals[j])+eps); // update
+            local_unorm += update_val*update_val;
+          }
+        }
+    }
+
+    __syncthreads();
+    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, hipcub::Max(), valid_items);
+    __syncthreads();
+    local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, hipcub::Max(), valid_items);
+    if(unorm != NULL)
+    {
+      __syncthreads();
+      local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, hipcub::Sum(), valid_items);
+    }
+
+    if(threadIdx.x == 0)
+    {
+        atomicMax(&new_max1[0], local_max_s1);
+        atomicMax(&new_max2[0], local_max_s2);
+        if(unorm != NULL){ atomicAdd(&unorm[0], local_unorm); }
+    }
+}
+
+#define NUM_PER_THREAD2 4
+#define NUM_THREADS2 1024
+#define NUM_PER_BLOCK2 4096
+
+template<typename T, int OPTIMIZER>
+__global__ void
+__launch_bounds__(NUM_THREADS2, 1)
+kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
+                const float *unorm, const float max_unorm, const float param_norm, \
+                const float beta1, const float beta2,
+                const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
+                float* max1, float* max2, float* new_max1, float* new_max2,
+                float weight_decay,
+                const float gnorm_scale, const int n)
+{
+
+    const int n_full = (blockDim.x * gridDim.x)*NUM_PER_THREAD2;
+    const int base_idx = (blockIdx.x * blockDim.x * NUM_PER_THREAD2);
+    int valid_items = 0;
+    float g_val = 0.0f;
+    float s1_vals[NUM_PER_THREAD2];
+    float s2_vals[NUM_PER_THREAD2];
+    const float correction1 = 1.0f - powf(beta1, step);
+    const float correction2 = sqrtf(1.0f - powf(beta2, step));
+    const float step_size = -lr*correction2/correction1;
+    //const float step_size = -lr*correction2/correction1;
+    float new_max_val1 = 1.0f/new_max1[0];
+    float new_max_val2 = 1.0f/new_max2[0];
+    float update_scale = 1.0f;
+
+    if(max_unorm > 0.0f)
+    {
+      update_scale = max_unorm > 0.0f ? sqrtf(unorm[0]) : 1.0f;
+      if(update_scale > max_unorm*param_norm){ update_scale = (max_unorm*param_norm)/update_scale; }
+      else{ update_scale = 1.0f; }
+    }
+    else{ update_scale = 1.0f; }
+
+    unsigned char c1s[NUM_PER_THREAD2];
+    unsigned char c2s[NUM_PER_THREAD2];
+    T p_vals[NUM_PER_THREAD2];
+    T g_vals[NUM_PER_THREAD2];
+    typedef cub::BlockLoad<T, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef cub::BlockLoad<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+
+    typedef cub::BlockStore<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+    typedef cub::BlockStore<T, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+
+    __shared__ float smem_quantiles1[256];
+    __shared__ float smem_quantiles2[256];
+
+    __shared__ union {
+        typename LoadT::TempStorage loadh;
+        typename LoadChar::TempStorage loadc;
+        typename StoreChar::TempStorage storec;
+        typename StoreT::TempStorage storeh;
+    } temp_storage;
+
+    if(threadIdx.x < 512)
+    {
+        if(threadIdx.x < 256)
+            smem_quantiles1[threadIdx.x] = quantiles1[threadIdx.x];
+        else
+            smem_quantiles2[threadIdx.x-256] = quantiles2[threadIdx.x-256];
+    }
+
+    __syncthreads();
+
+    for (unsigned int i = base_idx; i < n_full; i += gridDim.x*NUM_THREADS2*NUM_PER_THREAD2)
+    {
+        valid_items = n - i >= (TH*NUM_PER_THREAD) ? (TH*NUM_PER_THREAD) : n - i;
+        LoadT(temp_storage.loadh).Load(&(g[i]), g_vals, valid_items, (T)0.0f);
+        __syncthreads();
+        LoadChar(temp_storage.loadc).Load(&(state1[i]), c1s, valid_items, 128);
+        __syncthreads();
+        LoadChar(temp_storage.loadc).Load(&(state2[i]), c2s, valid_items, 0);
+        __syncthreads();
+        LoadT(temp_storage.loadh).Load(&(p[i]), p_vals, valid_items);
+
+        if((i + (threadIdx.x*NUM_PER_THREAD2) + NUM_PER_THREAD2) > n){ continue; }
+
+        # pragma unroll 4
+        for(unsigned int j = 0; j < NUM_PER_THREAD2; j++)
+        {
+            g_val = float(g_vals[j]);
+            g_val *= gnorm_scale;
+            s1_vals[j] = smem_quantiles1[c1s[j]];
+            s1_vals[j] = s1_vals[j]*max1[0];
+
+            s1_vals[j] = (s1_vals[j]*beta1) + (((1.0f-beta1)*g_val));
+
+            c1s[j] = dQuantize<0>(smem_quantiles1, 0.0f, s1_vals[j]*new_max_val1);
+
+            // make sure state1 term has still the same sign after quantization
+            // (not needed for state2 term which has only positive values)
+            if(signbit(smem_quantiles1[c1s[j]]) != signbit(s1_vals[j]))
+            {
+              if(s1_vals[j] > 0.0f)
+                  c1s[j] += 1;
+              else
+                  c1s[j] -= 1;
+            }
+
+            s2_vals[j] = smem_quantiles2[c2s[j]];
+            s2_vals[j] = s2_vals[j]*max2[0];
+            s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val));
+            c2s[j] = dQuantize<0>(smem_quantiles2, 0.0f, s2_vals[j]*new_max_val2);
+        }
+
+        # pragma unroll 4
+        for(unsigned int j = 0; j < NUM_PER_THREAD2; j++)
+        {
+            p_vals[j] = (T)(((float)p_vals[j]) + ((update_scale*step_size*(s1_vals[j]/(sqrtf(s2_vals[j])+(correction2*eps))))));
+            if(weight_decay > 0.0f)
+                p_vals[j] = update_scale*((float)p_vals[j])*(1.0f-(lr*weight_decay));
+        }
+
+        StoreT(temp_storage.storeh).Store(&(p[i]), p_vals, valid_items);
+        __syncthreads();
+        StoreChar(temp_storage.storec).Store(&(state1[i]), c1s, valid_items);
+        __syncthreads();
+        StoreChar(temp_storage.storec).Store(&(state2[i]), c2s, valid_items);
+        __syncthreads();
+    }
+}
+
+
+template<typename T, int OPTIMIZER>
+__global__ void
+__launch_bounds__(NUM_THREADS, 2)
+kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1,
+                float *unorm,
+                const float beta1, const float beta2,
+                const float eps, const int step,
+                float* __restrict__ const quantiles1,
+                float* max1, float* new_max1,
+                const float weight_decay,
+                const float gnorm_scale, const int n)
+{
+    const int n_full = gridDim.x * NUM_PER_BLOCK;
+    const int base_idx = (blockIdx.x * blockDim.x * NUM_PER_THREAD);
+    int valid_items = n - (blockIdx.x*NUM_PER_BLOCK) > NUM_PER_BLOCK ? NUM_PER_BLOCK : n - (blockIdx.x*NUM_PER_BLOCK);
+    float g_val = 0.0f;
+    float local_max_s1 = -FLT_MAX;
+    float local_unorm = 0.0f;
+
+    float s1_vals[NUM8BIT];
+    T g_vals[NUM8BIT];
+    unsigned char m_c1[NUM8BIT];
+
+    typedef cub::BlockLoad<T, NUM_THREADS, NUM8BIT, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef cub::BlockLoad<unsigned char, NUM_THREADS, NUM8BIT, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadUInt8;
+    typedef hipcub::BlockReduce<float, NUM_THREADS> BlockReduce;
+
+
+    __shared__ union {
+        typename LoadT::TempStorage loadh;
+        typename LoadUInt8::TempStorage loadc;
+        typename BlockReduce::TempStorage reduce;
+    } temp_storage;
+
+    __shared__ float smem_quantiles1[256];
+
+    if(threadIdx.x < 256)
+      smem_quantiles1[threadIdx.x] = quantiles1[threadIdx.x];
+
+    __syncthreads();
+
+    for (unsigned int i = base_idx; i < n_full; i += gridDim.x*NUM_THREADS*NUM8BIT)
+    {
+        valid_items = n - i >= (TH*NUM_PER_THREAD) ? (TH*NUM_PER_THREAD) : n - i;
+
+        __syncthreads();
+        LoadT(temp_storage.loadh).Load(&(g[i]), g_vals, valid_items, (T)0.0f);
+        __syncthreads();
+        LoadUInt8(temp_storage.loadc).Load(&(state1[i]), m_c1, valid_items, 128);
+
+        #pragma unroll 16
+        for(int j = 0; j < NUM8BIT; j++)
+        {
+            g_val = g_vals[j];
+            g_val *= gnorm_scale;
+            s1_vals[j] = smem_quantiles1[m_c1[j]]*max1[0];
+            switch(OPTIMIZER)
+            {
+                case MOMENTUM:
+                    if(step == 1)
+                      s1_vals[j] = (float)g_vals[j];
+                    else
+                      s1_vals[j] = s1_vals[j]*beta1 + ((float)g_vals[j]);
+                    if(unorm != NULL)
+                      local_unorm += s1_vals[j]*s1_vals[j];
+                    break;
+              case LION:
+                  s1_vals[j] = s1_vals[j]*beta2 + ((1.0f-beta2)*g_val);
+                  break;
+              case RMSPROP:
+                    s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*(g_val*g_val));
+                  break;
+            }
+
+            local_max_s1 = fmaxf(local_max_s1, fabsf(s1_vals[j]));
+        }
+    }
+
+    __syncthreads();
+    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, hipcub::Max(), valid_items);
+    if(threadIdx.x == 0){ atomicMax(&new_max1[0], local_max_s1); }
+    if(unorm != NULL)
+    {
+      __syncthreads();
+      local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, hipcub::Sum(), valid_items);
+      if(threadIdx.x == 0){ atomicAdd(&unorm[0], local_unorm); }
+    }
+
+}
+
+template<typename T, int OPTIMIZER>
+__global__ void
+__launch_bounds__(1024, 1)
+kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
+                const float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float beta2,
+                const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1,
+                float* max1, float* new_max1,
+                float weight_decay,
+                const float gnorm_scale, const int n)
+{
+
+    const int n_full = (blockDim.x * gridDim.x)*NUM_PER_THREAD2;
+    const int base_idx = (blockIdx.x * blockDim.x * NUM_PER_THREAD2);
+    int valid_items = 0;
+    float g_val = 0.0f;
+    float s1_vals[NUM_PER_THREAD2];
+    float new_max_val1 = 1.0f/new_max1[0];
+    float update_scale = 1.0f;
+
+    if(max_unorm > 0.0f)
+    {
+      update_scale = max_unorm > 0.0f ? sqrtf(unorm[0]) : 1.0f;
+      if(update_scale > max_unorm*param_norm){ update_scale = (max_unorm*param_norm)/update_scale; }
+      else{ update_scale = 1.0f; }
+    }
+    else{ update_scale = 1.0f; }
+
+    unsigned char c1s[NUM_PER_THREAD2];
+    T p_vals[NUM_PER_THREAD2];
+    T g_vals[NUM_PER_THREAD2];
+    typedef cub::BlockLoad<T, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef cub::BlockLoad<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+
+    typedef cub::BlockStore<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+    typedef cub::BlockStore<T, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+
+    __shared__ float smem_quantiles1[256];
+
+    __shared__ union {
+        typename LoadT::TempStorage loadh;
+        typename LoadChar::TempStorage loadc;
+        typename StoreChar::TempStorage storec;
+        typename StoreT::TempStorage storeh;
+    } temp_storage;
+
+    if(threadIdx.x < 256)
+        smem_quantiles1[threadIdx.x] = quantiles1[threadIdx.x];
+
+    __syncthreads();
+
+    for (unsigned int i = base_idx; i < n_full; i += gridDim.x*NUM_THREADS2*NUM_PER_THREAD2)
+    {
+        valid_items = n - i >= (TH*NUM_PER_THREAD) ? (TH*NUM_PER_THREAD) : n - i;
+        LoadT(temp_storage.loadh).Load(&(g[i]), g_vals, valid_items, (T)0.0f);
+        __syncthreads();
+        LoadChar(temp_storage.loadc).Load(&(state1[i]), c1s, valid_items, 128);
+        __syncthreads();
+        LoadT(temp_storage.loadh).Load(&(p[i]), p_vals, valid_items);
+
+        if((i + (threadIdx.x*NUM_PER_THREAD2) + NUM_PER_THREAD2) > n){ continue; }
+
+        # pragma unroll 4
+        for(unsigned int j = 0; j < NUM_PER_THREAD2; j++)
+        {
+            g_val = float(g_vals[j]);
+            g_val *= gnorm_scale;
+
+            if(weight_decay > 0.0f) {
+              switch(OPTIMIZER) {
+                case MOMENTUM:
+                case RMSPROP:
+                  g_val += ((float)p_vals[j])*weight_decay;
+                  break;
+                case LION:
+                  p_vals[j] = ((float)p_vals[j])*(1.0f-lr*weight_decay);
+                  break;
+              }
+            }
+
+            s1_vals[j] = smem_quantiles1[c1s[j]]*max1[0];
+
+            switch(OPTIMIZER)
+            {
+                case MOMENTUM:
+                  if(step == 1)
+                    s1_vals[j] = g_vals[j];
+                  else
+                    s1_vals[j] = s1_vals[j]*beta1 + ((float)g_vals[j]);
+
+                  p_vals[j] = ((float)p_vals[j]) + (-lr*update_scale*(s1_vals[j]));
+                  break;
+              case LION:
+                  p_vals[j] = ((float)p_vals[j]) - (lr*sgn(((float)s1_vals[j])*beta1 + ((1.0f-beta1)*((float)g_val))));
+                  s1_vals[j] = s1_vals[j]*beta2 + ((1.0f-beta2)*g_val);
+                  break;
+              case RMSPROP:
+                  s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*(g_val*g_val));
+                  p_vals[j] = ((float)p_vals[j]) - (lr*__fdividef(g_val,sqrtf(s1_vals[j])+eps));
+                  break;
+            }
+
+            c1s[j] = dQuantize<0>(smem_quantiles1, 0.0f, s1_vals[j]*new_max_val1);
+
+            // make sure state1 term has still the same sign after quantization
+            if(signbit(smem_quantiles1[c1s[j]]) != signbit(s1_vals[j]))
+            {
+              if(s1_vals[j] > 0.0f)
+                  c1s[j] += 1;
+              else
+                  c1s[j] -= 1;
+            }
+        }
+
+        StoreT(temp_storage.storeh).Store(&(p[i]), p_vals, valid_items);
+        __syncthreads();
+        StoreChar(temp_storage.storec).Store(&(state1[i]), c1s, valid_items);
+        __syncthreads();
+    }
+}
+
+
+template<typename T, int BLOCK_SIZE, int NUM_VALS>
+__global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int step, const int n)
+{
+  const int n_full = (BLOCK_SIZE*(n/BLOCK_SIZE)) + (n % BLOCK_SIZE == 0 ? 0 : BLOCK_SIZE);
+  int valid_items = 0;
+
+  typedef hipcub::BlockReduce<float, BLOCK_SIZE/NUM_VALS> BlockReduce;
+  typedef cub::BlockLoad<T, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+
+  __shared__ typename BlockReduce::TempStorage reduce;
+
+  __shared__ typename LoadT::TempStorage loadT;
+  T vals[NUM_VALS];
+  float local_sum = 0.0f;
+
+  for (unsigned int i = (blockIdx.x * BLOCK_SIZE); i < n_full; i += gridDim.x*BLOCK_SIZE)
+  {
+      valid_items = n - i > BLOCK_SIZE ? BLOCK_SIZE : n - i;
+      local_sum = 0.0f;
+
+      __syncthreads();
+      LoadT(loadT).Load(&(g[i]), vals, valid_items, (T)0.0f);
+
+     #pragma unroll NUM_VALS
+     for(int j = 0; j < NUM_VALS; j++)
+       local_sum += ((float)vals[j])*((float)vals[j]);
+
+    local_sum = BlockReduce(reduce).Sum(local_sum, valid_items);
+    if(threadIdx.x == 0)
+    {
+      if(step == 1)
+      {
+        // initialize with the same norm for all positions
+        //#pragma unroll 10
+        for(int j = 0; j < 100; j++)
+          atomicAdd(&gnorm_vec[j], local_sum);
+      }
+      else
+          atomicAdd(&gnorm_vec[step % 100], local_sum);
+    }
+
+  }
+}
+
+
+#define LANES 2
+#define QUAD 3
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
+__launch_bounds__(256, 3)
+__global__ void
+kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
+                const float beta1, const float beta2,
+                const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
+                float* absmax1, float* absmax2,
+                float weight_decay,
+                const float gnorm_scale, const bool skip_zeros, const int n)
+{
+
+    //const int n_full = n + (n%BLOCK_SIZE);
+    const int n_full = gridDim.x * BLOCK_SIZE;
+    const int base_idx = (blockIdx.x * BLOCK_SIZE);
+    int valid_items = 0;
+    float g_val = 0.0f;
+    float s1_vals[N_PER_TH];
+    float s2_vals[N_PER_TH];
+    // 2-5%
+    const float correction1 = 1.0f - __powf(beta1, step);
+    const float correction2 = sqrtf(1.0f -__powf(beta2, step));
+    const float step_size = __fdividef(-lr*correction2,correction1);
+    const int lane_id = threadIdx.x % LANES;
+    float new_local_abs_max1 = -FLT_MAX;
+    float new_local_abs_max2 = -FLT_MAX;
+    float quadrants1[QUAD];
+    float quadrants2[QUAD];
+
+    unsigned char c1s[N_PER_TH];
+    unsigned char c2s[N_PER_TH];
+    T g_vals[N_PER_TH];
+    T p_vals[N_PER_TH];
+    typedef cub::BlockLoad<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef cub::BlockLoad<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+
+    typedef cub::BlockStore<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+    typedef cub::BlockStore<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+
+    __shared__ float smem_quantiles1[LANES][257];
+    __shared__ float smem_quantiles2[LANES][257];
+    typedef hipcub::BlockReduce<float, BLOCK_SIZE/N_PER_TH> BlockReduce1;
+    typedef hipcub::BlockReduce<float, BLOCK_SIZE/N_PER_TH> BlockReduce2;
+    __shared__ typename BlockReduce1::TempStorage reduce1;
+    __shared__ typename BlockReduce2::TempStorage reduce2;
+    __shared__ float smem_exchange1[1];
+    __shared__ float smem_exchange2[1];
+
+    __shared__ union {
+        typename LoadT::TempStorage loadh;
+        typename LoadChar::TempStorage loadc;
+        typename StoreChar::TempStorage storec;
+        typename StoreT::TempStorage storeh;
+    } temp_storage;
+    // init: 0.2 -> 0.23
+
+    // 0.23 -> 0.23
+      smem_quantiles1[0][threadIdx.x] = quantiles1[threadIdx.x];
+      smem_quantiles2[0][threadIdx.x] = quantiles2[threadIdx.x];
+      # pragma unroll
+      for(unsigned int j = 1; j < LANES; j++)
+      {
+        smem_quantiles1[j][threadIdx.x] = smem_quantiles1[0][threadIdx.x];
+        smem_quantiles2[j][threadIdx.x] = smem_quantiles2[0][threadIdx.x];
+      }
+
+    __syncthreads();
+
+    #pragma unroll
+    for(int k = 0; k < QUAD; k++)
+    {
+      quadrants1[k] = smem_quantiles1[lane_id][(k*256/(QUAD+1)) + (256/(QUAD+1)-1)];
+      quadrants2[k] = smem_quantiles2[lane_id][(k*256/(QUAD+1)) + (256/(QUAD+1)-1)];
+    }
+
+
+    for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
+    {
+        // loads: 0.23 -> 0.85/1.44
+        valid_items = n - i >= BLOCK_SIZE ? BLOCK_SIZE : n - i;
+        __syncthreads();
+        LoadT(temp_storage.loadh).Load(&(g[i]), g_vals, valid_items, (T)0.0f);
+        __syncthreads();
+        LoadChar(temp_storage.loadc).Load(&(state1[i]), c1s, valid_items, 128);
+        __syncthreads();
+        LoadChar(temp_storage.loadc).Load(&(state2[i]), c2s, valid_items, 0);
+
+        new_local_abs_max1 = -FLT_MAX;
+        new_local_abs_max2 = -FLT_MAX;
+
+        //  update: 2.48/1.57 -> 2.51/1.60
+        # pragma unroll N_PER_TH
+        for(unsigned int j = 0; j < N_PER_TH; j++)
+        {
+            if(!isnan((float)g_vals[j]) && !isinf((float)g_vals[j]))
+						{
+							s2_vals[j] = smem_quantiles2[lane_id][c2s[j]]*absmax2[i/BLOCK_SIZE];
+              g_val = g_vals[j];
+              //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps);
+              //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val;
+              g_val *= gnorm_scale;
+              
+							s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val));
+
+							s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
+							s1_vals[j] = (s1_vals[j]*beta1) + (((1.0f-beta1)*g_val));
+						}
+            else
+            {
+              s1_vals[j] = 0.0f;
+              s2_vals[j] = 0.0f;
+            }
+
+            new_local_abs_max1 = fmaxf(new_local_abs_max1, fabsf(s1_vals[j]));
+            new_local_abs_max2 = fmaxf(new_local_abs_max2, fabsf(s2_vals[j]));
+        }
+
+
+        //  reduce: 2.51/1.60 -> 2.67/1.69
+        new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, hipcub::Max());
+        new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, hipcub::Max());
+
+        if(threadIdx.x == 0)
+        {
+          smem_exchange1[0] = new_local_abs_max1;
+          smem_exchange2[0] = new_local_abs_max2;
+        }
+
+        __syncthreads();
+
+        if(threadIdx.x == 0)
+        {
+          absmax1[i/BLOCK_SIZE] = new_local_abs_max1;
+          absmax2[i/BLOCK_SIZE] = new_local_abs_max2;
+        }
+        else
+        {
+          new_local_abs_max1 = smem_exchange1[0];
+          new_local_abs_max2 = smem_exchange2[0];
+        }
+
+        __syncthreads();
+        LoadT(temp_storage.loadh).Load(&(p[i]), p_vals, valid_items, (T)0.0f);
+        //  reduce: 2.67/1.69 -> 2.67/1.70
+        # pragma unroll N_PER_TH
+        for(unsigned int j = 0; j < N_PER_TH; j++)
+        {
+						//if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
+            if(!isnan((float)g_vals[j]) && !isinf((float)g_vals[j]))
+						{
+							p_vals[j] = (T)(((float)p_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps)))))));
+							if(weight_decay > 0.0f)
+									p_vals[j] = ((float)p_vals[j])*(1.0f-(lr*weight_decay));
+						}
+        }
+
+        //  store: 0.85/1.44 -> 2.48/1.57
+        __syncthreads();
+        StoreT(temp_storage.storeh).Store(&(p[i]), p_vals, valid_items);
+
+        //  quantizaztion: 2.67/1.70  -> 3.4/3.3
+        # pragma unroll N_PER_TH
+        for(unsigned int j = 0; j < N_PER_TH; j++)
+        {
+            c1s[j] = quantize_2D<1>(quadrants1, smem_quantiles1[lane_id], __fdividef(s1_vals[j],new_local_abs_max1));
+            c2s[j] = quantize_2D<0>(quadrants2, smem_quantiles2[lane_id], __fdividef(s2_vals[j],new_local_abs_max2));
+
+            // make sure state1 term has still the same sign after quantization
+            // (not needed for state2 term which has only positive values)
+            if(signbit(smem_quantiles1[lane_id][c1s[j]]) != signbit(s1_vals[j]))
+            {
+              if(s1_vals[j] > 0.0f)
+                  c1s[j] += 1;
+              else
+                  c1s[j] -= 1;
+            }
+        }
+
+        __syncthreads();
+        StoreChar(temp_storage.storec).Store(&(state1[i]), c1s, valid_items);
+        __syncthreads();
+        StoreChar(temp_storage.storec).Store(&(state2[i]), c2s, valid_items);
+    }
+}
+
+
+#define LANES 2
+#define QUAD 3
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
+__launch_bounds__(256, 3)
+__global__ void
+kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char* state1,
+                const float beta1, const float beta2,
+                const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1,
+                float* absmax1,
+                float weight_decay,
+                const float gnorm_scale, const bool skip_zeros, const int n)
+{
+
+    //const int n_full = n + (n%BLOCK_SIZE);
+    const int n_full = gridDim.x * BLOCK_SIZE;
+    const int base_idx = (blockIdx.x * BLOCK_SIZE);
+    int valid_items = 0;
+    float g_val = 0.0f;
+    float s1_vals[N_PER_TH];
+    // 2-5%
+    const int lane_id = threadIdx.x % LANES;
+    float new_local_abs_max1 = -FLT_MAX;
+    float quadrants1[QUAD];
+
+    unsigned char c1s[N_PER_TH];
+    T g_vals[N_PER_TH];
+		T p_vals[N_PER_TH];
+
+    typedef cub::BlockLoad<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef cub::BlockLoad<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+
+    typedef cub::BlockStore<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+    typedef cub::BlockStore<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+
+    __shared__ float smem_quantiles1[LANES][257];
+    typedef hipcub::BlockReduce<float, BLOCK_SIZE/N_PER_TH> BlockReduce1;
+    __shared__ typename BlockReduce1::TempStorage reduce1;
+    __shared__ float smem_exchange1[1];
+
+    __shared__ union {
+        typename LoadT::TempStorage loadh;
+        typename LoadChar::TempStorage loadc;
+        typename StoreChar::TempStorage storec;
+        typename StoreT::TempStorage storeh;
+    } temp_storage;
+    // init: 0.2 -> 0.23
+
+    // 0.23 -> 0.23
+		smem_quantiles1[0][threadIdx.x] = quantiles1[threadIdx.x];
+		# pragma unroll
+		for(unsigned int j = 1; j < LANES; j++)
+			smem_quantiles1[j][threadIdx.x] = smem_quantiles1[0][threadIdx.x];
+
+    __syncthreads();
+
+    #pragma unroll
+    for(int k = 0; k < QUAD; k++)
+      quadrants1[k] = smem_quantiles1[lane_id][(k*256/(QUAD+1)) + (256/(QUAD+1)-1)];
+
+    for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
+    {
+        // loads: 0.23 -> 0.85/1.44
+        valid_items = n - i >= BLOCK_SIZE ? BLOCK_SIZE : n - i;
+        __syncthreads();
+        LoadT(temp_storage.loadh).Load(&(g[i]), g_vals, valid_items, (T)0.0f);
+        __syncthreads();
+        LoadChar(temp_storage.loadc).Load(&(state1[i]), c1s, valid_items, 128);
+        __syncthreads();
+        LoadT(temp_storage.loadh).Load(&(p[i]), p_vals, valid_items, (T)0.0f);
+
+        new_local_abs_max1 = -FLT_MAX;
+
+        //  update: 2.48/1.57 -> 2.51/1.60
+        # pragma unroll N_PER_TH
+        for(unsigned int j = 0; j < N_PER_TH; j++)
+        {
+            g_val = float(g_vals[j]);
+            g_val *= gnorm_scale;
+            if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
+            {
+              if(weight_decay > 0.0f) {
+                switch(OPTIMIZER) {
+                  case MOMENTUM:
+                  case ADAGRAD:
+                  case RMSPROP:
+                    g_val += ((float)p_vals[j])*weight_decay;
+                    break;
+                  case LION:
+                    p_vals[j] = ((float)p_vals[j])*(1.0f-lr*weight_decay);
+                    break;
+                }
+              }
+
+							s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
+
+							switch(OPTIMIZER)
+							{
+									case MOMENTUM:
+										if(step == 1)
+											s1_vals[j] = g_val;
+										else
+											s1_vals[j] = (s1_vals[j]*beta1) + g_val;
+										break;
+									case LION:
+										// here, using gvals[j] to store the gradient smoothed by beta1 for the following parameter update, before the momentum is updated by beta2
+										g_vals[j] = lr*sgn(((float)s1_vals[j])*beta1 + ((1.0f-beta1)*g_val));
+										s1_vals[j] = s1_vals[j]*beta2 + ((1.0f-beta2)*g_val);
+										break;
+									case RMSPROP:
+										s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*(g_val*g_val));
+										break;
+									case ADAGRAD:
+										s1_vals[j] = s1_vals[j] + (g_val*g_val);
+										break;
+							}
+						}
+
+            new_local_abs_max1 = fmaxf(new_local_abs_max1, fabsf(s1_vals[j]));
+        }
+
+
+        //  reduce: 2.51/1.60 -> 2.67/1.69
+        new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, hipcub::Max());
+
+        if(threadIdx.x == 0)
+          smem_exchange1[0] = new_local_abs_max1;
+
+        __syncthreads();
+
+        if(threadIdx.x == 0)
+          absmax1[i/BLOCK_SIZE] = new_local_abs_max1;
+        else
+          new_local_abs_max1 = smem_exchange1[0];
+
+        //  reduce: 2.67/1.69 -> 2.67/1.70
+        # pragma unroll N_PER_TH
+        for(unsigned int j = 0; j < N_PER_TH; j++)
+				{
+						if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
+						{
+							switch(OPTIMIZER)
+							{
+									case MOMENTUM:
+										p_vals[j] = ((float)p_vals[j]) - lr*(s1_vals[j]);
+										break;
+									case LION:
+										p_vals[j] = ((float)p_vals[j]) - ((float)g_vals[j]);
+										break;
+									case RMSPROP:
+										g_val = g_vals[j];
+										p_vals[j] = ((float)p_vals[j]) - lr*(__fdividef(g_val, sqrtf(s1_vals[j])+eps));
+										break;
+									case ADAGRAD:
+										g_val = g_vals[j];
+										p_vals[j] = ((float)p_vals[j]) - lr*(__fdividef(g_val, sqrtf(s1_vals[j])+eps));
+										break;
+							}
+						}
+				}
+
+        //  store: 0.85/1.44 -> 2.48/1.57
+        __syncthreads();
+        StoreT(temp_storage.storeh).Store(&(p[i]), p_vals, valid_items);
+
+        //  quantizaztion: 2.67/1.70  -> 3.4/3.3
+        # pragma unroll N_PER_TH
+        for(unsigned int j = 0; j < N_PER_TH; j++)
+        {
+            c1s[j] = quantize_2D<1>(quadrants1, smem_quantiles1[lane_id], __fdividef(s1_vals[j],new_local_abs_max1));
+
+            // make sure state1 term has still the same sign after quantization
+            // (not needed for state2 term which has only positive values)
+            if(signbit(smem_quantiles1[lane_id][c1s[j]]) != signbit(s1_vals[j]))
+            {
+              if(s1_vals[j] > 0.0f)
+                  c1s[j] += 1;
+              else
+                  c1s[j] -= 1;
+            }
+        }
+
+        __syncthreads();
+        StoreChar(temp_storage.storec).Store(&(state1[i]), c1s, valid_items);
+    }
+}
+
+template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kgetColRowStats(T * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols)
+{
+  // 0. reset stats to -FLT_MAX
+  // 1. load row-by-row ITEMS_PER_THREAD (TILE_SIZE==THREADS*ITEMS_PER_THREAD)
+  // 2. compute col max (per thread); store in smem due to register pressure
+  // 3. compute row max (per block); store in smem to accumulate full global mem transation
+  // 4. store data via atomicMax
+
+  // each block loads TILE_COLs columns and TILE_ROW rows
+  // after reading a tile the row counter increase by TILE_ROWS
+  // the col counter reset after reading TILE_COL elements
+  const int base_row = ((blockIdx.x*TILE_COLS)/tiledCols)*TILE_ROWS;
+  // col increases by TILE_SIZE for each block and wraps back to 0 after tiledCols is reached
+  const int base_col = (blockIdx.x*TILE_COLS) % tiledCols;
+  const int base_idx = (base_row*cols) + base_col;
+  const int items_per_load = ITEMS_PER_THREAD*THREADS;
+
+  typedef cub::BlockLoad<T, THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_VECTORIZE> LoadT;
+  typedef hipcub::BlockReduce<float, THREADS> BlockRowReduce;
+  typedef hipcub::BlockReduce<int, THREADS> BlockRowSum;
+  typedef cub::BlockExchange<float, THREADS, ITEMS_PER_THREAD> BlockExchange;
+
+  __shared__ union {
+    typename BlockExchange::TempStorage exchange;
+    typename BlockRowReduce::TempStorage rowreduce;
+    typename BlockRowSum::TempStorage rowsum;
+    typename LoadT::TempStorage loadt;
+  } temp_storage;
+
+  __shared__ float smem_row_absmax_values[ITEMS_PER_THREAD*THREADS];
+  __shared__ int smem_row_nnz_values[TILE_ROWS];
+
+  half local_data[ITEMS_PER_THREAD];
+  float local_data_fp32[ITEMS_PER_THREAD];
+  float local_col_absmax_values[ITEMS_PER_THREAD];
+  int local_row_nnz_count = 0;
+  float row_absmax = -FLT_MAX;
+
+  // 0. reset stats to -FLT_MAX
+  for(int j = 0; j < ITEMS_PER_THREAD; j++)
+  {
+    //smem_col_absmax_values[threadIdx.x + (j*THREADS)] = -FLT_MAX;
+    smem_row_absmax_values[threadIdx.x + (j*THREADS)] = -FLT_MAX;
+    smem_row_nnz_values[threadIdx.x + (j*THREADS)] = 0;
+  }
+
+  #pragma unroll ITEMS_PER_THREAD
+  for(int j = 0; j < ITEMS_PER_THREAD; j++)
+    local_col_absmax_values[j] = -FLT_MAX;
+
+  __syncthreads();
+
+  int valid_items = cols - base_col > items_per_load ? items_per_load : cols - base_col;
+  int i = base_idx;
+  // we load row after row from the base_position
+  // 1. load row-by-row ITEMS_PER_THREAD (TILE_SIZE==THREADS*ITEMS_PER_THREAD)
+  for(int row = 0; row < TILE_ROWS; row++)
+  {
+    if(base_row+row >= rows){ break; }
+    local_row_nnz_count = 0;
+    i = base_idx + ((row)*cols);
+    // each thread gets data from the same column
+    __syncthreads();
+    LoadT(temp_storage.loadt).Load(&(A[i]), local_data, valid_items, __float2half(0.0f));
+
+    #pragma unroll ITEMS_PER_THREAD
+    for(int j = 0; j < ITEMS_PER_THREAD; j++)
+      local_data[j] = fabsf(local_data[j]);
+
+
+    if(SPARSE_DECOMP)
+      #pragma unroll ITEMS_PER_THREAD
+      for(int j = 0; j < ITEMS_PER_THREAD; j++)
+      {
+        if((float)local_data[j] >= nnz_threshold)
+        {
+          local_row_nnz_count += 1;
+          local_data[j] = 0.0f;
+        }
+      }
+
+    // 2. compute col max (per thread); store in smem due to register pressure
+    #pragma unroll ITEMS_PER_THREAD
+    for(int j = 0; j < ITEMS_PER_THREAD; j++)
+      // take the col max for this row
+      // we use shared memory because register pressure is too high if we do this locally
+      //smem_col_absmax_values[threadIdx.x + (j*THREADS)] = fmaxf(smem_col_absmax_values[threadIdx.x + (j*THREADS)], __half2float(local_data[j]));
+      local_col_absmax_values[j] = fmaxf(local_col_absmax_values[j], __half2float(local_data[j]));
+
+    // 3. compute row max (per block); store in smem to accumulate full global mem transation
+
+    // this is slow as it uses extra registers, but we need this to be compatible with Kepler and Maxwell (no fp16 units)
+    #pragma unroll ITEMS_PER_THREAD
+    for(int j = 0; j < ITEMS_PER_THREAD; j++)
+      local_data_fp32[j] = local_data[j];
+
+    __syncthreads();
+
+    row_absmax = (float)BlockRowReduce(temp_storage.rowreduce).Reduce(local_data_fp32, hipcub::Max());
+    if(SPARSE_DECOMP)
+    {
+      __syncthreads();
+      local_row_nnz_count = BlockRowSum(temp_storage.rowsum).Sum(local_row_nnz_count);
+    }
+    // we store the data temporarily in shared memory so we
+    // can execute a full atomic block transaction into global memory later
+    // we use a striped arrangement [0, 8, 16, 24, ..] for t0 for faster stores
+    if(threadIdx.x == 0)
+    {
+      smem_row_absmax_values[(row % ITEMS_PER_THREAD) + ((row/ITEMS_PER_THREAD)*ITEMS_PER_THREAD)] = row_absmax;
+      // each blockIdx.x process 16 rows and 64*4=256 columns -> we sum nnz over 256 columns and have 16 values per block
+      smem_row_nnz_values[row] = local_row_nnz_count;
+    }
+
+    __syncthreads();
+
+  }
+
+  // 4. store data via atomicMax
+  // to store col data efficienctly we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
+  // into a striped arangement: [0, 8, 16, 24, ..] for t0
+  __syncthreads();
+  BlockExchange(temp_storage.exchange).BlockedToStriped(local_col_absmax_values);
+
+  #pragma unroll ITEMS_PER_THREAD
+  for(int j = 0; j < ITEMS_PER_THREAD; j++)
+    if(base_col+threadIdx.x+(j*THREADS) < cols)
+    {
+      float val = colStats[base_col+(threadIdx.x+(j*THREADS))];
+      if(val < local_col_absmax_values[j])
+        atomicMax(&colStats[base_col+(threadIdx.x+(j*THREADS))], local_col_absmax_values[j]);
+    }
+
+  for(int j = 0; j < ITEMS_PER_THREAD; j++)
+    if(base_row+threadIdx.x+(j*THREADS) < rows)
+    {
+      float val = rowStats[base_row+(threadIdx.x+(j*THREADS))];
+      if(val < smem_row_absmax_values[threadIdx.x+(j*THREADS)])
+        atomicMax(&rowStats[base_row+(threadIdx.x+(j*THREADS))], smem_row_absmax_values[threadIdx.x+(j*THREADS)]);
+    }
+
+    if(SPARSE_DECOMP)
+      if(threadIdx.x < TILE_ROWS)
+        nnz_count_row[blockIdx.x*TILE_ROWS+threadIdx.x+1] = smem_row_nnz_values[threadIdx.x];
+
+}
+
+template __global__ void kgetColRowStats<half, 64, 4, 16, 64*4, 0>(half * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
+template __global__ void kgetColRowStats<half, 64, 4, 16, 64*4, 1>(half * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
+
+#define MM_DEQUANT_CONST 6.200012e-05f //1.0f/(127.0f*127.0f)
+
+template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats, half *out, float* newRowStats, float* newcolStats, half *__restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n)
+{
+
+  // Strategy: To dequantize we need to load col/row statistics. This can be very expensive
+  // since different row/col stats need to be loaded with each thread.
+  // (1, bad algorithm) Loading 32 items per thread would only occur 1 row load, but this increases register pressure
+  // and would lead to low global load utilization.
+  // (2, bad algorithm) If each thread loads some columns and multiple rows one needs to do lot of row loads
+  // for each thread and this is duplicated by a factor of 32/num-cols-per-thread.
+  // (3, good algorithm) Combining (1) and (2) we use sub-tiles of size 32xk in shared memory per threadblock.
+  // This allows for efficient row/col loading from shared memory within the tile.
+  // We can run for example 32x128 sub-tiles and warp-strided loads of 4 elements so that each thread has
+  // the same col statistic but needs to load 4 row stats from shared memory. To prevent bank conflicts
+  // we use a block-striped shared memory config [1, 31, 63, 95] so no bank conflicts happen during the
+  // shared memory loads.
+
+  // data is in 32 column-tile major with tile width 32 columns and numRows rows
+  // L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
+  // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+  // C1. Compute val(row_stat*col_stat)/(127*127) (load 1/(127*127 into register))
+  // C2. Compute normalization values and store col values in register
+  // S1. Store C1 into 16-bit output
+  // S2. Store col/row statistics of new buffer in shared memory
+
+  // We allow for sub-tiles to span multiple col32 tiles. This is okay
+  // since the items per thread only rely on a single column statistic.
+
+
+  const int n_out = numRows*numCols;
+
+  int num_row_tiles = (numRows/SUBTILE_ROWS) + (numRows % SUBTILE_ROWS == 0 ? 0 : 1);
+  // we have tiles of size numRows*32, thus col only increases every numRows
+  // num_row_tiles is the tiles after which the column increases by 32
+  // blockIdx.x is the index of the current tile
+  int col = ((threadIdx.x % 32) + ((blockIdx.x/num_row_tiles)*32));
+  // base_row increases by SUBTILE_ROWS every block. It wraps back to zero once num_row_tiles is reached
+  int base_row = (blockIdx.x*SUBTILE_ROWS) % (num_row_tiles*SUBTILE_ROWS);
+
+  // SUBTILE_ROWS is independent from ITEMS_PER_THREAD is independent from THREADS
+  // subtiles have 32*SUBTILE_ROWS elements <= THREADS*ITEMS_PER_THREAD
+  // Total subtiles should be n/(32*SUBTILE_ROWS) where each subtile has SUBTILE_ROW*32/4 threads.
+  // For example for a 1024x1024 matrix with 128 SUBTILE_ROWS and 4 ITEMS_PER_THREAD we have
+  // 1024*1024/(128*32) = 256 tiles
+  // 256 tiles are 256*128*32/4 = 256*1024 threads
+
+  // 1. Figure out how index relates to the start of the sub-tile
+  // 2. Each thread < SUBTILE_ROWS calculates row index
+  // 3. Load striped and store in shared memory
+
+  int local_values[ITEMS_PER_THREAD];
+  half local_output[ITEMS_PER_THREAD];
+  float local_rowStats[ITEMS_PER_THREAD];
+  __shared__ float smem_rowStats[SUBTILE_ROWS];
+
+  typedef cub::BlockLoad<int, THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_DIRECT> LoadInt32;
+  typedef cub::BlockExchange<int, THREADS, ITEMS_PER_THREAD> ExchangeInt32;
+  __shared__ typename LoadInt32::TempStorage loadint32;
+  __shared__ typename ExchangeInt32::TempStorage exchangeint32;
+
+
+  // L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
+  float colStat = col >= numCols ? 0.0f : colStats[col];
+  float local_biasValue = ((bias == NULL) || (col >= numCols)) ? 0.0f : __half2float(bias[col]);
+  // no block loads for rows for now -- keep it simple
+  for(int j = threadIdx.x; j < SUBTILE_ROWS; j+=blockDim.x)
+  {
+    // todo: is this global mem access slow due to overlaps or does the L1 cache work well here?
+    int row = (base_row+j) % numRows; // wrap around
+    // each warp accesses the same element, for four consequitive elements
+    // todo: update description about striped shared memory, it is not needed
+    // rowidx: [0, 1, 2, 3...] and each warp reads ITEMS_PER_THREAD consequitive elements
+    smem_rowStats[j] = rowStats[row];
+  }
+  __syncthreads();
+
+
+  // each block processes SUBTILE_ROWS*32 elements
+  const int items_per_load = THREADS*ITEMS_PER_THREAD;
+  const int rows_per_load = items_per_load/32;
+
+  int subtile_base_row = (threadIdx.x / 32)*ITEMS_PER_THREAD; // row within the tile
+  int row_offset = 0;
+  // subtile_idx starts at the base_row*32 + the total offset for a full numRow*32 tile is passed
+  int subtile_start = (blockIdx.x/num_row_tiles)*(numRows*32) + (base_row*32);
+  for(int subtile_idx = subtile_start; subtile_idx < subtile_start + (SUBTILE_ROWS*32); subtile_idx+=items_per_load)
+  {
+    int valid_rows = numRows - (base_row+row_offset) > rows_per_load ? rows_per_load : numRows - (base_row+row_offset);
+    int valid_items = valid_rows*32;
+    if(valid_items <= 0) // the sub-tile might have more elements than the tile itself
+      break;
+
+    // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+    LoadInt32(loadint32).Load(&(A[subtile_idx]), local_values, valid_items, 0);
+    ExchangeInt32(exchangeint32).BlockedToWarpStriped(local_values, local_values);
+
+    #pragma unroll ITEMS_PER_THREAD
+    for(int j = 0; j < ITEMS_PER_THREAD; j++)
+      local_rowStats[j] = smem_rowStats[subtile_base_row+row_offset+j];
+
+    #pragma unroll ITEMS_PER_THREAD
+    for(int j = 0; j < ITEMS_PER_THREAD; j++)
+      local_output[j] = __float2half((local_values[j]*MM_DEQUANT_CONST*local_rowStats[j]*colStat) + local_biasValue);
+      //absmax_col = fmax(fabsf(local_output[j]), absmax_col);
+
+    // we store data in row major
+    // to store data efficiently, we want to use block exchange: [0, 32, 64, 92] -> [0, 1, 2, 3]
+    // so that each thread holds ITEMS_PER_THREAD consecutive items for each row
+    // this way throughput into storage is increased by a factor of ~2x
+    // for now we use a simple store
+    #pragma unroll ITEMS_PER_THREAD
+    for(int j = 0; j < ITEMS_PER_THREAD; j++)
+    {
+      int outIdx = col + ((base_row+subtile_base_row+row_offset+j)*numCols);
+      if(outIdx< n_out && col < numCols)
+        out[outIdx] = local_output[j];
+    }
+
+    row_offset += rows_per_load;
+  }
+}
+
+
+template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kDoubleRowColQuant(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols)
+{
+  // assumes TILE_SIZE == THREADS*ITEMS_PER_THREAD
+  // Each thread reads the same column but multiple rows
+  // Rows are loaded in shared memory and access is shared across the threadblock (broadcast)
+
+  // 0. Load row stats data into shared memory; load col stat (1 fixed per thread)
+  // 1. Load data row by row (should be at least with TILE_SIZE = 512)
+  // 2. quantize data with row/col stats
+  // 3. Store data (TILE_SIZE = 512 is a bit slow, but should still be close enough to good performance)
+
+  // each block loads TILE_COLs columns and TILE_ROW rows
+  // after reading a tile the row counter increase by TILE_ROWS
+  // the col counter reset after reading TILE_COL elements
+  const int base_row = ((blockIdx.x*TILE_COLS)/tiledCols)*TILE_ROWS;
+  // col increases by TILE_SIZE for each block and wraps back to 0 after tiledCols is reached
+  const int base_col = (blockIdx.x*TILE_COLS) % tiledCols;
+  const int base_idx = (base_row*cols) + base_col;
+  const int items_per_load = ITEMS_PER_THREAD*THREADS;
+
+  typedef cub::BlockLoad<half, THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_VECTORIZE> LoadHalf;
+  __shared__ typename LoadHalf::TempStorage loadhalf;
+  typedef cub::BlockStore<char, THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_VECTORIZE> StoreInt8;
+  __shared__ typename StoreInt8::TempStorage storeint8;
+
+  __shared__ float smem_row_stats[TILE_ROWS];
+  __shared__ unsigned int smem_nnz_row_idx[TILE_ROWS];
+
+  half local_data[ITEMS_PER_THREAD];
+  float local_col_stats[ITEMS_PER_THREAD];
+  char local_quantized_data[ITEMS_PER_THREAD];
+
+  // 0. Load row stats data into shared memory; load col stat (1 fixed per thread)
+  #pragma unroll ITEMS_PER_THREAD
+  for(int j = 0; j < ITEMS_PER_THREAD; j++)
+    if(base_col+(threadIdx.x*ITEMS_PER_THREAD) + j < cols)
+      local_col_stats[j] = __fdividef(127.0f, colStats[base_col+(threadIdx.x*ITEMS_PER_THREAD)+j]);
+
+  for(int i = threadIdx.x; i < TILE_ROWS; i+=blockDim.x)
+  {
+    if(base_row + i < rows)
+      smem_row_stats[i] = rowStats[base_row+i];
+
+    if(SPARSE_DECOMP)
+      smem_nnz_row_idx[i] = nnz_block_ptr[(TILE_ROWS*blockIdx.x) + i];
+  }
+  __syncthreads();
+
+  // we load row after row from the base_position
+  // 1. Load data row by row (should be at least with TILE_SIZE = 512)
+  for(int row = 0; row < TILE_ROWS; row++)
+  {
+    if(base_row + row >= rows){ break; }
+    int i = base_idx + (row*cols);
+    int valid_items = cols - base_col > items_per_load ? items_per_load : cols - base_col;
+
+
+    LoadHalf(loadhalf).Load(&(A[i]), local_data, valid_items, 0.0f);
+    float row_stat = __fdividef(127.0f, smem_row_stats[row]);
+
+    // 2. quantize data with row/col stats
+    #pragma unroll ITEMS_PER_THREAD
+    for(int j = 0; j < ITEMS_PER_THREAD; j++)
+    {
+      // we already pre-normalized the col/row stat:
+      // what this does is float/absmax*127 = int8
+      if(SPARSE_DECOMP)
+      {
+        if(fabsf((float)local_data[j]) >= threshold)
+        {
+          local_quantized_data[j] = 0;
+
+					int old_idx = atomicInc(&smem_nnz_row_idx[row], UINT_MAX);
+
+          rowidx[old_idx] = base_row+row;
+          colidx[old_idx] = base_col+(threadIdx.x*ITEMS_PER_THREAD)+j;
+          val[old_idx] = local_data[j];
+        }
+				else
+				{
+					local_quantized_data[j] = (char)(rintf(__half2float(local_data[j])*row_stat));
+				}
+      }
+      else
+        local_quantized_data[j] = (char)(rintf(__half2float(local_data[j])*row_stat));
+    }
+
+    StoreInt8(storeint8).Store(&(out_row_normed[i]), local_quantized_data, valid_items);
+
+    // 2. quantize data with row/col stats
+    #pragma unroll ITEMS_PER_THREAD
+    for(int j = 0; j < ITEMS_PER_THREAD; j++)
+    {
+      // we already pre-normalized the col/row stat:
+      // what this does is float/absmax*127 = int8
+			local_quantized_data[j] = (char)(rintf(__half2float(local_data[j])*local_col_stats[j]));
+    }
+
+    __syncthreads();
+    StoreInt8(storeint8).Store(&(out_col_normed[i]), local_quantized_data, valid_items);
+
+  }
+}
+
+template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__ void kTransformRowToFormat(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols)
+{
+
+  // 0. Load data into 32*32 shared memory tiles
+  // 1. transpose / reorder in shared memory
+  // 2. store
+
+  // COL32 FORMAT:
+  // rows*32 tiles
+
+  // TURING FORMAT:
+  // 8*32 tiles with 4*4 subtiles
+  // the 8*32 subtile has first all 4*4 subtiles of even rows (max 4*4*4 = 64 elements)
+  // the subsequent 4*4 subtiles are for all odd rows if some rows columns are empty the values are zero
+  // the tile repeats again after the 8*32 tile in a major column order, meaning: (next 8 rows are A[8:16, 0:32])
+  // the next tile is the next 8 rows for the same 32 columns. Once all rows are finished, the column
+  // index increases by 32
+
+  // AMPERE FORMAT:
+  // 32*32 tiles with 8*32 subtiles. The rows are interleaved in pairs of two rows with offset of 8 between pairs of two rows:
+	// row idx (each number stands for 32 values): [0 1 8 9 16 17 24 25] [2 3 10 11 18 19 26 27]...
+  // the tiles are column-major ordered, so after 1024*1024 values we process: A[32:64, 0:32]
+
+
+  // To have efficient loads and stores if we transpose we need 128 consequitive bytes which at 1 byte are 128 values
+  // As such we need:
+  // at least 32*4 shared memory tiles for col32; preferably 32*32
+  // at least 32*6 shared memory tiles for col32_ampere: preferably 32*32
+  // at least 32*8 shared memory tiles for col4_turing: preferably 32*32
+  // for efficient loading of row major we need to load 128 elements and repeat this 32 items
+  // this would imply a 32x128 shared memory tile -> 4kb
+  // It is more efficient to have more than 1 warp, so with 64 threads we need 32x128 -> 8 kb
+  // we have 64k sharded mem per SM in Turing which is 8 blocks per SM which is 2*8 = 32 warps = 100% occupancy
+  // for turing and 50% for A100 and 75% for RTX 30s / A40 which is probably good enough
+  // register pressure should be low with: 8 registers from local memoryh per block and 64 registers per SM
+  //
+  // to make the shared memory work with that occupancy we might need to union the block loads/stores
+
+  // each block loads TILE_COLs columns and TILE_ROW rows
+  // after reading a tile the row counter increase by TILE_ROWS
+  // the col counter reset after reading TILE_COL elements
+  const int base_row = ((blockIdx.x*TILE_COLS)/tiledCols)*TILE_ROWS;
+  // col increases by TILE_SIZE for each block and wraps back to 0 after tiledCols is reached
+  const int base_col = (blockIdx.x*TILE_COLS) % tiledCols;
+  const int base_idx = (base_row*cols) + base_col;
+
+  // we load 128 bytes per warp with
+  // 32 rows for transposes that fill col32 types
+  // so that we can have contiguous stores
+  __shared__ char smem_data[32*33*ITEMS_PER_THREAD];
+  char local_data[ITEMS_PER_THREAD];
+  typedef cub::BlockExchange<char, THREADS, ITEMS_PER_THREAD> BlockExchange;
+
+  // we load row after row from the base_position
+  // Load data row by row
+  int warps = blockDim.x/32;
+  int warp_id = threadIdx.x/32;
+  int warp_lane = threadIdx.x % 32;
+  int offset = 0;
+
+  int smem_row = 0;
+  // each warp loads one row of 128 bytes
+  for(int row = warp_id; row < TILE_ROWS; row+=warps)
+  {
+    int i = base_idx + (row*cols);
+    // we load up to 128 bytes/items per load
+    int valid_items = cols - base_col > 32*ITEMS_PER_THREAD ? 32*ITEMS_PER_THREAD : cols - base_col;
+
+    // 0. Load data into 32*32 shared memory tiles
+    if(base_row + row < rows)
+    {
+      #pragma unroll ITEMS_PER_THREAD
+      for(int j = 0; j < ITEMS_PER_THREAD; j++)
+      {
+        int col_idx = warp_lane+(j*32);
+        if(col_idx < valid_items)
+          local_data[j] = A[i+col_idx];
+        else
+          local_data[j] = 0;
+      }
+    }
+    else
+    {
+      #pragma unroll ITEMS_PER_THREAD
+      for(int j = 0; j < ITEMS_PER_THREAD; j++)
+        local_data[j] = 0;
+    }
+
+    if(TRANSPOSE)
+    {
+      #pragma unroll ITEMS_PER_THREAD
+      for(int j = 0; j < ITEMS_PER_THREAD; j++)
+      {
+        int local_col = (32*j)+warp_lane;
+        //int local_row = row;
+        // store as 256x32
+        smem_data[(local_col*33) + row] = local_data[j];
+      }
+    }
+    else
+    {
+      // treat smem as 32x256, that is 32 rows and 256 columns
+      #pragma unroll ITEMS_PER_THREAD
+      for(int j = 0; j < ITEMS_PER_THREAD; j++)
+        smem_data[row*32*ITEMS_PER_THREAD + (warp_lane) + (j*32)] = local_data[j];
+    }
+
+
+
+    smem_row += warps;
+
+    // 1. transpose / reorder in shared memory
+    if(smem_row % 32 == 0)
+    {
+      smem_row = 0;
+      __syncthreads();
+
+      for(int subrow = warp_id; subrow < 32; subrow+=warps)
+      {
+        for(int j = 0; j < ITEMS_PER_THREAD; j++)
+        {
+
+          switch(FORMAT)
+          {
+              case COL32:
+                if(TRANSPOSE)
+                {
+                  // data lies in shared memory in the following way:
+                  // row0 [col0 col1 ... col31]
+                  // row1 [col0 col1 ... col31]
+                  // ...
+                  //
+                  // As such we read consequtive entries with 256 threads (8rows x 32 columns)
+                  // as j increase, the row increase by a factor of 8
+                  // We load 8 rows per subrow loop, and subrow increase by 8 per loop
+                  // so we have an offset of 8 rows every loop or (subrow/warps)*8 = (subrow/8)*8
+                  const int jrow = j*ITEMS_PER_THREAD; // 8 rows per j
+                  const int subrow_loop_row = (subrow/warps)*ITEMS_PER_THREAD*ITEMS_PER_THREAD; // 8 rows per j; 8j per subrow loop (subrow/warps)
+                  //const int local_row =  warp_id; // each warp_id is one row
+                  //const int block_row = base_col; // block offset for row
+                  //const int local_col = warp_lane
+                  //const int global_col = base_row; // block offset for col
+                  if((base_col + subrow_loop_row + jrow + warp_id < outRows) && (base_row+warp_lane < rows))
+                  {
+                    // each row hae 32 columns and is offset by 1 to prevent bank conflict during storage into smem
+                    char data = smem_data[(subrow_loop_row + jrow + warp_id)*33 + warp_lane];
+
+                    // each 32 columns we have new tile
+                    // each tile has size outRows*32 and base_row is done in increments of 32
+                    offset = base_row*outRows;
+                    out[offset + (base_col + jrow + subrow_loop_row)*32 + threadIdx.x] = data;
+                  }
+                }
+                else
+                {
+                  if(((base_row+subrow) < rows) && (base_col+(j*32)+warp_lane < outCols))
+                  {
+                    offset = (base_col/32)*(32*rows);
+                    char data = smem_data[(subrow*32*ITEMS_PER_THREAD) + (j*32) + warp_lane];
+                    out[offset+(base_row+subrow)*32 + ((j)*rows*32)+warp_lane] = data;
+                  }
+                }
+                break;
+              case COL_TURING:
+                // TURING FORMAT:
+                // 8*32 tiles with 4*4 subtiles
+                // the 8*32 subtile has first all 4*4 subtiles of even rows (max 4*4*4 = 64 elements)
+                // the subsequent 4*4 subtiles are for all odd rows if some rows columns are empty the values are zero
+                // the tile repeats again after the 8*32 tile in a major column order, meaning: (next 8 rows are A[8:16, 0:32])
+                // the next tile is the next 8 rows for the same 32 columns. Once all rows are finished, the column
+                // index increases by 32
+                //
+                // [0 0 0 0, 2 2 2 2, 4 4 4 4, 6 6 6 6, 0 0 0 0 ...]
+                if(TRANSPOSE)
+                {
+                  const int jrow = j*ITEMS_PER_THREAD; // 8 rows per j
+                  const int subrow_loop_row = (subrow/warps)*ITEMS_PER_THREAD*ITEMS_PER_THREAD; // 8 rows per j; 8j per subrow loop (subrow/warps)
+                  //const int local_row =  warp_id; // each warp_id is one row
+                  //const int block_row = base_col; // block offset for row
+                  //const int local_col = warp_lane
+                  //const int global_col = base_row; // block offset for col
+                  if((base_col + subrow_loop_row + jrow + warp_id < outRows) && (base_row+warp_lane < rows))
+                  {
+                    // each row hae 32 columns and is offset by 1 to prevent bank conflict during storage into smem
+                    char data = smem_data[(subrow_loop_row + jrow + warp_id)*33 + warp_lane];
+
+                    // each 32 columns we have new tile
+                    // each tile has size 8*32 = 256 elements offset
+                    // for each row offset of 8 we increaes the tile first
+                    // after all rows are exhausted, we increase the col
+                    int row_offset = ((base_col+jrow+subrow_loop_row+warp_id)/8)*256; // global_row+jrow+subrow_loop_row+local_row, increase tile(=256) every 8 rows
+
+                    // we increase by row_tile_column every 32 columns
+                    // base_row increase in increments of 32
+                    //int row_tile_column = 256*outRows/8; // there are outRows/8 row tiles, and each tile is 256 elements
+                    //int col_offset = (base_row/32)*row_tile_column;
+                    // -> we can remove the divisions to speed up compute since outRows is always a multiple of 8
+                    // 256*outRows/8*base_row/32 = outRows*base_row
+                    int col_offset = outRows*base_row;
+
+                    offset = row_offset+col_offset;
+
+                    // since we process even number of rows with each j (8) and with each subrow (8j) we can determine
+                    // odd or even rows with the warp_id (each warp processes one row)
+                    // the col is warp_lane (max 32 columns per row) and the row warp_id
+                    if(warp_id % 2 == 1)
+                      // odd
+                      offset += 128 + (warp_lane/4)*16 + (warp_lane%4) + (((warp_id%8)-1)*2);
+                    else
+                      // even
+                      offset += 0   + (warp_lane/4)*16 + (warp_lane%4) + ((warp_id%8)*2);
+
+                    out[offset] = data;
+                  }
+                }
+                else
+                {
+                  if(((base_row+subrow) < rows) && (base_col+(j*32)+warp_lane < outCols))
+                  {
+                    char data = smem_data[(subrow*32*ITEMS_PER_THREAD) + (j*32) + warp_lane];
+                    // set offset designates the tile offset among the 8*32 tiles
+                    // we first increase rows and then columns. Since we load 128 columns at once
+                    // we increase the offset by outRows*32 every 32 columns
+                    // additionally, we increase the offset by 8*32=256 every 8 rows
+                    offset = ((base_col+(j*32))/32)*outRows*32 + (((base_row+subrow)/8)*256); // global offset (8x32 tile)
+                    // first 4 rows are reserved for even rows, [0, 2, 4, 6], the next 4 for odd
+                    // each of these has 32 values in total for 32*4 = 128 as offset if odd
+                    // every set of 4 columns increases the total offset by 16
+                    // each even row increase the offset by 4, for example row 2 is offset by 4, 4 by 6 etc so: subrow/2*4 = subrow*2
+                    // this happends every 8 rows anew (subrow % 8)
+                    // one writes 4 columns at once that is (col % 4) for the particular index in the subtile
+                    int subcol = warp_lane;
+
+                    // add local offset (4x4 sub-tile)
+                    if(subrow % 2 == 1)
+                      // odd
+                      offset += 128 + (subcol/4)*16 + (subcol%4) + (((subrow%8)-1)*2);
+                    else
+                      // even
+                      offset += 0   + (subcol/4)*16 + (subcol%4) + ((subrow%8)*2);
+
+                    out[offset] = data;
+                  }
+                }
+                break;
+								case COL_AMPERE:
+									// AMPERE FORMAT:
+									// 32*32 tiles with 8*32 subtiles. The rows are interleaved in pairs of two rows with offset of 8 between pairs of two rows:
+									// row idx (each number stands for 32 values): [0 1 8 9 16 17 24 25] [2 3 10 11 18 19 26 27]...
+									// the tiles are column-major ordered, so after 1024*1024 values we process: A[32:64, 0:32]
+									if(TRANSPOSE)
+									{
+										const int jrow = j*ITEMS_PER_THREAD; // 8 rows per j
+										const int subrow_loop_row = (subrow/warps)*ITEMS_PER_THREAD*ITEMS_PER_THREAD; // 8 rows per j; 8j per subrow loop (subrow/warps)
+										//const int local_row =  warp_id; // each warp_id is one row
+										//const int block_row = base_col; // block offset for row
+										//const int local_col = warp_lane
+										//const int global_col = base_row; // block offset for col
+										if((base_col + subrow_loop_row + jrow + warp_id < outRows) && (base_row+warp_lane < rows))
+										{
+											// each row hae 32 columns and is offset by 1 to prevent bank conflict during storage into smem
+											char data = smem_data[(subrow_loop_row + jrow + warp_id)*33 + warp_lane];
+
+											// each 32 columns we have new tile
+											// each tile has size 32*32 = 1024 elements offset
+											// for each row offset of 32 we increaes the tile first
+											// after all rows are exhausted, we increase the col
+											int row_offset = ((base_col+jrow+subrow_loop_row+warp_id)/32)*1024; // global_row+jrow+subrow_loop_row+local_row, increase tile(=256) every 8 rows
+
+											// we increase by row_tile_column every 32 columns
+											// base_row increase in increments of 32
+											//int row_tile_column = 1024*outRows/32; // there are outRows/32 row tiles, and each tile is 1024 elements
+											//int col_offset = (base_row/32)*row_tile_column;
+											// -> we can remove the divisions to speed up compute since outRows is always a multiple of 8
+											// 1024*outRows/32*base_row/32 = outRows*base_row
+											int col_offset = outRows*base_row;
+
+											offset = row_offset+col_offset;
+
+
+											// same as in the non-transpose case (see below)
+											// the difference is that now rows = cols
+											// in this case warp_id = subrow
+
+											// [0 1 8 9 16 17 24 25] [2 3 10 11 18 19 26 27]...
+											// subrow % 8 -> [0,1] in tile0, [2, 3] in tile 1 etc
+											// subrow % 2 -> 0 for 1st row in the pair, 1 for the 2nd row
+											// every 2 rows, the offset increases by two [0, 1, 8, 9...]
+											// every 2 rows, the row index increase by 8 [0, 1, 8, 9...]
+											int local_row = (jrow + warp_id) % 32; // offset for row > 32 is already calculated into row_offset
+											int ampere_row = ((local_row % 8)/2)*8 + (local_row/8)*2 + (local_row % 2);
+
+											// global offset + row with 32 cols each + 32 cols per j + col_idx=warp_lane
+											out[offset + (ampere_row*32) + warp_lane] = data;
+										}
+									}
+									else
+									{
+										if(((base_row+subrow) < rows) && (base_col+(j*32)+warp_lane < outCols))
+										{
+											char data = smem_data[(subrow*32*ITEMS_PER_THREAD) + (j*32) + warp_lane];
+
+											// set offset designates the tile offset among the 32*32 tiles
+											// we first increase rows and then columns. Since we load 128 columns at once
+											// we increase the offset by outRows*32 every 32 columns
+											// additionally, we increase the offset by 32*32=1024 every 32 rows
+											offset = ((base_col+(j*32))/32)*outRows*32 + (((base_row+subrow)/32)*1024); // global offset (32x32 tile)
+
+											// [0 1 8 9 16 17 24 25] [2 3 10 11 18 19 26 27]...
+											// subrow % 8 -> [0,1] in tile0, [2, 3] in tile 1 etc
+											// subrow % 2 -> 0 for 1st row in the pair, 1 for the 2nd row
+											// every 2 rows, the offset increases by two [0, 1, 8, 9...]
+											// every 2 rows, the row index increase by 8 [0, 1, 8, 9...]
+											int local_row = ((subrow % 8)/2)*8 + (subrow/8)*2 + (subrow % 2);
+
+											// global offset + row with 32 cols each + 32 cols per j + col_idx
+											out[offset + (local_row*32) + warp_lane] = data;
+										}
+									}
+								break;
+          }
+        }
+      }
+    }
+  }
+}
+
+#define DENORM 1.0f/127.0f
+#define MAX_SPARSE_COUNT 32
+#define SMEM_SIZE 8*256
+template <typename T, int SPMM_ITEMS, int BITS>
+__global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB)
+{
+
+  // 0. load balancing: We process rows with most columns first (count_vec)and we process one row per block
+  //    If a block finishes, the next one is scheduled. Since the last blocks like have fewer
+  //    elements they finish faster "fillin up" the gaps left by larger blocks
+
+  // without tensor cores
+  // 1. use rowidx_length to find what to load (as many blocks as there are rows)
+  // 2. Load A into registers
+  // 3. each warp loads all required rows of B but each warp is offset by k
+  // 4. Do mma operations that accumulate into registers
+  // 5. Each warp stores its output row into matrix C
+
+  const int count = max_count[blockIdx.x];
+  const int local_max_idx = max_idx[blockIdx.x];
+  const int offset = local_max_idx == 0 ? 0 : offset_rowidx[local_max_idx-1];
+  const int local_row_idx = rowidx[offset];
+
+  const int warp_id = threadIdx.x / 32;
+  const int warp_idx = threadIdx.x % 32;
+  const int warp_offset = (warp_id*32)*SPMM_ITEMS;
+  const int num_items = BITS == 8 ? 8 : 8;
+  int idx_col_B = warp_offset;
+  int local_idx_col_B_offset = 0;
+
+  half local_valA[MAX_SPARSE_COUNT];
+  int local_colidxA[MAX_SPARSE_COUNT];
+  half local_valC[SPMM_ITEMS];
+  T local_valsB[num_items];
+  half local_valOut[num_items];
+  // 128 byte loads per warp == 4 bytes per thread
+
+  // 2. Load A into registers
+  for(int j = 0; j < MAX_SPARSE_COUNT; j++)
+  {
+    local_valA[j] = j < count ? values[offset+j] : __float2half(0.0f);
+    local_colidxA[j] = j < count ? colidx[offset+j] : 0;
+  }
+
+  // each thread processes SPMM_ITEMS=32 per iteration. We have 256 threads. 32*256=x192
+  // we expect each warp to be SPMM_ITEMS*32 apart
+  // we have a total of 128 bytes for the bank with a bank size of 4 bytes
+  // added 3 bytes = 6 values between warps should reduce bank conflicts
+  __shared__ half smem_dequant_stats[SMEM_SIZE];
+
+
+  while(idx_col_B <  colsB)
+  {
+
+    if(dequant_stats != NULL)
+    {
+      for(int i = threadIdx.x; i < SMEM_SIZE; i+=blockDim.x)
+        if((idx_col_B+i-local_idx_col_B_offset) < colsB)
+          smem_dequant_stats[i] = dequant_stats[idx_col_B+i-local_idx_col_B_offset];
+
+      __syncthreads();
+    }
+
+    #pragma unroll SPMM_ITEMS
+    for(int j = 0; j < SPMM_ITEMS; j++)
+      local_valC[j] = 0.0f;
+
+    #pragma unroll
+    for(int i = 0; i < count; i++)
+    {
+        // 3. each warp loads all required rows of B but each warp is offset by k
+        int row_offset = colsB*local_colidxA[i];
+
+        #pragma unroll SPMM_ITEMS
+        for(int j = 0; j < SPMM_ITEMS; j+=num_items)
+        {
+          // 4. Multiply the tile -> accumulate outputs in shared memory until 128 bytes it reached
+          int idx = idx_col_B + (warp_idx*SPMM_ITEMS) + j;
+          if(idx >= colsB){ break; }
+          if((idx+num_items < colsB))
+          {
+            if(BITS == 8)
+              reinterpret_cast<float2(&)[num_items]>(local_valsB)[0] = reinterpret_cast<float2*>(B)[(row_offset+ idx)/num_items];
+            else
+              reinterpret_cast<float4(&)[num_items]>(local_valsB)[0] = reinterpret_cast<float4*>(B)[(row_offset+ idx)/num_items];
+          }
+          else
+          {
+            #pragma unroll num_items
+            for(int k = 0; k < num_items; k++)
+              if(idx+k < colsB)
+                local_valsB[k] = B[row_offset+idx+k];
+              else
+                local_valsB[k] = 0.0f;
+          }
+          #pragma unroll num_items
+          for(int k = 0; k < num_items; k++)
+          {
+            if(BITS == 8 && dequant_stats != NULL)
+              // we do texture cache reads (__ldg) on dequant_stats which should be super fast
+            {
+              float valB = local_valsB[k];
+              float valA = local_valA[i];
+              if(valB != 0.0 && valA != 0.0)
+                local_valC[j+k] = (float)local_valC[j+k] + ((float)smem_dequant_stats[idx+k-local_idx_col_B_offset])*DENORM*valB*valA;
+            }
+            else
+              local_valC[j+k] = (float)local_valC[j+k] + (float)local_valsB[k]*(float)local_valA[i];
+          }
+        }
+    }
+
+    int idx_row_C = (colsB*local_row_idx);
+
+    #pragma unroll SPMM_ITEMS
+    for(int j = 0; j < SPMM_ITEMS; j+=num_items)
+    {
+      //int idx_col_C =  idx_col_B + (32*j) + warp_idx;
+      int idx_col_C =  idx_col_B + warp_idx*SPMM_ITEMS + j;
+      int idx_val = idx_col_C + idx_row_C;
+
+      if(idx_col_C +num_items < colsB)
+      {
+
+          // load outputs to do inplace addition
+          reinterpret_cast<float4(&)[num_items/4]>(local_valOut)[0] = reinterpret_cast<float4*>(out)[idx_val/num_items];
+
+          #pragma unroll num_items
+          for(int k = 0; k < num_items; k++)
+            local_valC[(j/num_items) + k] = (float)local_valC[(j/num_items) + k] + (float)local_valOut[k];
+
+          reinterpret_cast<float4*>(out)[idx_val/num_items] = reinterpret_cast<float4(&)[num_items]>(local_valC)[j/num_items];
+      }
+      else
+      {
+        #pragma unroll num_items
+        for(int k = 0; k < num_items; k++)
+         if(idx_col_C + k < colsB)
+           out[idx_val+k] = (float)out[idx_val+k]+(float)local_valC[j+k];
+      }
+    }
+
+    idx_col_B += blockDim.x*SPMM_ITEMS;
+    local_idx_col_B_offset += blockDim.x*SPMM_ITEMS;
+  }
+}
+
+template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA)
+{
+	int local_colidx = idx[blockIdx.x];
+
+	if(FORMAT==COL_TURING)
+	{
+		// TURING FORMAT:
+		// 8*32 tiles with 4*4 subtiles
+		// the 8*32 subtile has first all 4*4 subtiles of even rows (max 4*4*8 = 128 elements)
+		// the subsequent 4*4 subtiles are for all odd rows if some rows columns are empty the values are zero
+		// the tile repeats again after the 8*32 tile in a major column order, meaning: (next 8 rows are A[8:16, 0:32])
+		// the next tile is the next 8 rows for the same 32 columns. Once all rows are finished, the column
+		// index increases by 32
+		// columns are grouped in increments of 4, meaning that one has the following rows and columns
+		// rows: [0 0 0 0, 2 2 2 2, 4 4 4 4, 6 6 6 6, 0 0 0 0 ...]
+		// cols: [0 1 2 3, 0 1 2 4, 0 1 2 3, 0 1 2 3, 4 5 6 7 ...]
+
+		// each thread reads 1 element = 1 row
+		for(int row = threadIdx.x; row < rowsA; row+= blockDim.x)
+		{
+			int offset_per_col_tile = ((rowsA+7)/8)*32*8;
+			int tile_offset_rows = (row/8)*32*8;
+			int tile_offset_cols = (local_colidx/32)*offset_per_col_tile;
+			int offset = 0;
+			int subtile_col_idx = local_colidx%32;
+			int subtile_row_idx = row % 8;
+			if(row % 2 == 1)
+				offset += 128 + (subtile_col_idx/4)*16 + (subtile_col_idx%4) + ((subtile_row_idx-1)*2);
+			else
+				// even
+				offset += 0   + (subtile_col_idx/4)*16 + (subtile_col_idx%4) + (subtile_row_idx*2);
+
+			offset += tile_offset_rows + tile_offset_cols;
+
+			char val = A[offset];
+
+			int out_idx = (row*idx_size) + blockIdx.x;
+			out[out_idx] = val;
+		}
+	}
+	else if(FORMAT == COL_AMPERE)
+	{
+
+		for(int row = threadIdx.x; row < rowsA; row+= blockDim.x)
+		{
+			// we got 32x32 tiles and we use the magic equation from the cublasLt doc to get the element
+			// within each tile.
+			int offset_per_col_tile = ((rowsA+31)/32)*32*32;
+			int tile_offset_rows = (row/32)*32*32;
+			int tile_offset_cols = (local_colidx/32)*offset_per_col_tile;
+			int subtile_col_idx = local_colidx%32;
+			int subtile_row_idx = row % 32;
+			// this magic is taken from the cublasLt doc (search for COL32)
+			int offset = (((subtile_row_idx%8)/2*4+subtile_row_idx/8)*2+subtile_row_idx%2)*32+subtile_col_idx;
+			offset += tile_offset_cols + tile_offset_rows;
+
+			char val = A[offset];
+			int out_idx = (row*idx_size) + blockIdx.x;
+			out[out_idx] = val;
+		}
+	}
+}
+
+
+//template <int QUANT_TYPE, typename INPT, typename COMPT, typename OUTT> __global__ void kMatmul_inference_4bit(INPT *A, unsigned char *B, OUTT *out, int lda, int ldb, int rowsA, int colsA, int colsB)
+//{
+//// element-wise kernel
+//// 1. Load batch x k into registers
+//// 2. Load k x k into registers
+//// 3. dequantize and store in second pair of k x k
+//// 4. matmul
+//// 5. sum with cub
+//// 6. store outputs
+//// TC kernel
+//// use k warps per thread block
+//// 1. threadblock use read-only cache to read in register tile for A into shared memory
+//// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments
+//// 3. each warp reads a segment of values 16x32 from B 
+//// 4. do dequantization from register of B into second pair of registers
+//// 5. store (4) into fragment
+//// 6. matmul aggregate into fragment C
+//// 7. aggreecate files of C into shared memroy block C
+//// 8. sum (7)
+//// 9. write outputs to matmul output matrix
+//}
+
+template <typename T, typename TCAST, int ITEMS> __device__ inline void vector_load(T *local, T * __restrict__ const buffer, int idx, int limit_base, int limit, float zero_value = 0.0f)
+{
+    if(limit_base + ITEMS <= limit)
+      reinterpret_cast<TCAST*>(local)[0] = reinterpret_cast<TCAST*>(buffer)[idx/ITEMS];
+    else
+    {
+      for(int k = 0; k < ITEMS; k++)
+      {
+        if(limit_base + k < limit)
+          local[k] = buffer[idx+k];
+        else
+          local[k] = (T)zero_value;
+      }
+    }
+}
+
+#define WARPS 5
+template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A,  T* B,  T * out,  int lda, int ldb, int ldc)
+{
+
+#if __CUDA_ARCH__ >= 750
+	using namespace nvcuda;
+  int col_offset = blockIdx.x *32;
+  const int warp_id = threadIdx.x / 32;
+  const int half_warp_id = threadIdx.x / 16;
+  const int half_warp_lane = threadIdx.x % 16;
+  const int batch_size_warps = (WARPS-1)*2;
+  const int val_per_iter = blockDim.x-32;
+
+  T local_A[4];
+  T local_B[128];
+
+  const int a_tile_offset = 16;
+  const int b_tile_offset = (16*32 + 16);
+
+  __shared__ T smem_A[8*16 + (2*16*(batch_size_warps-1))];
+  __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))];
+  //__shared__ T smem_C[8*32];
+
+   wmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major> a_frag;
+   wmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major> b_frag;
+   wmma::fragment<wmma::accumulator, 8, 32, 16, half> c_frag;
+   wmma::fill_fragment(c_frag, 0.0f);
+
+  int ticktock = 0;
+  int idx = 0 + threadIdx.x;
+  int loaded_values = 0;
+  // prefetch
+  if(idx < K && warp_id < (WARPS-1))
+  {
+    if(loaded_values == 0)
+    {
+      local_A[0] = A[idx];
+      local_A[1] = A[idx+(1*val_per_iter)];
+      local_A[2] = A[idx+(2*val_per_iter)];
+      local_A[3] = A[idx+(3*val_per_iter)];
+
+      #pragma unroll 32
+      for(int col = 0; col < 32; col++)
+      {
+        local_B[col] = B[(col_offset+col)*ldb+idx];
+        local_B[col+32] = B[(col_offset+col)*ldb+idx+(1*val_per_iter)];
+        local_B[col+64] = B[(col_offset+col)*ldb+idx+(2*val_per_iter)];
+        local_B[col+96] = B[(col_offset+col)*ldb+idx+(3*val_per_iter)];
+      }
+      loaded_values = 3;
+    }
+    else
+    {
+
+      if(loaded_values == 3)
+      {
+        local_A[0] = local_A[1];
+        #pragma unroll 32
+        for(int col = 0; col < 32; col++)
+          local_B[col] = local_B[col+(32)];
+      }
+      else if(loaded_values == 2)
+      {
+        local_A[0] = local_A[2];
+        #pragma unroll 32
+        for(int col = 0; col < 32; col++)
+          local_B[col] = local_B[col+(64)];
+      }
+      else
+      {
+        local_A[0] = local_A[3];
+        #pragma unroll 32
+        for(int col = 0; col < 32; col++)
+          local_B[col] = local_B[col+(96)];
+      }
+      loaded_values--;
+    }
+
+    smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0];
+
+    #pragma unroll 32
+    for(int col = 0; col < 32; col++)
+        smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col];
+  }
+  else if(warp_id < (WARPS-1))
+  {
+    local_A[0] = T(0.0);
+    smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] =  0.0f;
+
+    #pragma unroll 32
+    for(int col = 0; col < 32; col++)
+      local_B[col] = 0.0f;
+
+    #pragma unroll 32
+    for(int col = 0; col < 32; col++)
+      smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f;
+  }
+  ticktock = ticktock == 0 ? 1 : 0;
+
+  //for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32)
+  for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32)
+  {
+    idx = base_idx + threadIdx.x;
+
+    __syncthreads();
+    if(idx < K && warp_id < (WARPS-1))
+    {
+      //local_A[0] = A[idx];
+
+      //#pragma unroll 32
+      //for(int col = 0; col < 32; col++)
+      //  local_B[col] = B[(col_offset+col)*ldb+idx];
+      if(loaded_values == 0)
+      {
+        local_A[0] = A[idx];
+        local_A[1] = A[idx+(1*val_per_iter)];
+        local_A[2] = A[idx+(2*val_per_iter)];
+        local_A[3] = A[idx+(3*val_per_iter)];
+
+        #pragma unroll 32
+        for(int col = 0; col < 32; col++)
+        {
+          local_B[col] = B[(col_offset+col)*ldb+idx];
+          local_B[col+32] = B[(col_offset+col)*ldb+idx+(1*val_per_iter)];
+          local_B[col+64] = B[(col_offset+col)*ldb+idx+(2*val_per_iter)];
+          local_B[col+96] = B[(col_offset+col)*ldb+idx+(3*val_per_iter)];
+        }
+        loaded_values = 3;
+
+      }
+      else
+      {
+
+        if(loaded_values == 3)
+        {
+          local_A[0] = local_A[1];
+          #pragma unroll 32
+          for(int col = 0; col < 32; col++)
+            local_B[col] = local_B[col+(32)];
+        }
+        else if(loaded_values == 2)
+        {
+          local_A[0] = local_A[2];
+          #pragma unroll 32
+          for(int col = 0; col < 32; col++)
+            local_B[col] = local_B[col+(64)];
+        }
+        else
+        {
+          local_A[0] = local_A[3];
+          #pragma unroll 32
+          for(int col = 0; col < 32; col++)
+            local_B[col] = local_B[col+(96)];
+        }
+        loaded_values--;
+      }
+
+      smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0];
+
+      #pragma unroll 32
+      for(int col = 0; col < 32; col++)
+          smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col];
+    }
+    else if(warp_id < (WARPS-1))
+    {
+      local_A[0] = T(0.0);
+      smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] =  0.0f;
+
+      #pragma unroll 32
+      for(int col = 0; col < 32; col++)
+        local_B[col] = 0.0f;
+
+      #pragma unroll 32
+      for(int col = 0; col < 32; col++)
+        smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f;
+    }
+    ticktock = ticktock == 0 ? 1 : 0;
+
+    if(warp_id == (WARPS-1))
+      for(int k = 0; k < batch_size_warps; k++)
+      {
+        wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
+        wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
+        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+      }
+  }
+
+  __syncthreads();
+  if(warp_id != (WARPS-1)){ return; }
+  // only warp_id == (WARPS-1) from here
+  int warp_lane = threadIdx.x % 32;
+
+  ticktock = ticktock == 0 ? 1 : 0;
+  for(int k = 0; k < batch_size_warps; k++)
+  {
+    wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
+    wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
+    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+  }
+
+  // 129 mu
+  if(warp_id == (WARPS-1))
+    wmma::store_matrix_sync(&(smem_A[0]), c_frag, 32, wmma::mem_row_major);
+
+  if(col_offset + warp_lane < M)
+    out[col_offset + warp_lane] = smem_A[warp_lane];
+#endif
+}
+
+template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize)
+{
+
+#if __CUDA_ARCH__ >= 750
+	using namespace nvcuda;
+  int col_offset = blockIdx.x *32;
+  const int warp_id = threadIdx.x / 32;
+  const int half_warp_id = threadIdx.x / 16;
+  const int half_warp_lane = threadIdx.x % 16;
+  const int batch_size_warps = (WARPS-1)*2;
+
+  T local_A[2];
+  T local_B[64];
+  unsigned char local_B_4bit[32];
+
+  const int a_tile_offset = 16;
+  const int b_tile_offset = (16*32 + 16);
+
+  __shared__ T smem_A[8*16 + (2*16*(batch_size_warps-1))];
+  __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))];
+  //__shared__ T smem_C[8*32];
+
+   wmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major> a_frag;
+   wmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major> b_frag;
+   wmma::fragment<wmma::accumulator, 8, 32, 16, half> c_frag;
+   wmma::fill_fragment(c_frag, 0.0f);
+
+  int ticktock = 0;
+  int idx = 0 + threadIdx.x;
+  int loaded_values = 0;
+  // prefetch
+  if(idx < K && warp_id < (WARPS-1))
+  {
+    if(loaded_values == 0)
+    {
+      local_A[0] = A[idx];
+      local_A[1] = A[idx+blockDim.x-32];
+
+      #pragma unroll 32
+      for(int col = 0; col < 32; col++)
+        local_B_4bit[col] = B[(col_offset+col)*ldb+idx];
+
+      loaded_values = 1;
+    }
+    else
+    {
+      local_A[0] = local_A[1];
+      loaded_values--;
+
+      #pragma unroll 64
+      for(int col = 0; col < 64; col+=2)
+      {
+        local_B[col] = dhDequantizeNF4(local_B_4bit[col/2] >> 4)*T(1.0f);
+        local_B[col+1] = dhDequantizeNF4(local_B_4bit[col/2] & 0x0F)*T(1.0f);
+      }
+    }
+
+    smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0];
+
+    #pragma unroll 32
+    for(int col = 0; col < 32; col++)
+        smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col];
+  }
+  else if(warp_id < (WARPS-1))
+  {
+    local_A[0] = T(0.0);
+    smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] =  0.0f;
+
+    #pragma unroll 32
+    for(int col = 0; col < 32; col++)
+      local_B[col] = 0.0f;
+
+    #pragma unroll 32
+    for(int col = 0; col < 32; col++)
+      smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f;
+  }
+  ticktock = ticktock == 0 ? 1 : 0;
+
+  //for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32)
+  for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32)
+  {
+    idx = base_idx + threadIdx.x;
+
+    __syncthreads();
+    if(idx < K && warp_id < (WARPS-1))
+    {
+      if(loaded_values == 0)
+      {
+        local_A[0] = A[idx];
+        local_A[1] = A[idx+blockDim.x-32];
+
+        #pragma unroll 32
+        for(int col = 0; col < 32; col++)
+        {
+          local_B_4bit[col] = B[(col_offset+col)*ldb+idx];
+          local_B_4bit[col+16] = B[(col_offset+col)*ldb+idx];
+        }
+
+        loaded_values = 1;
+      }
+      else
+      {
+        local_A[0] = local_A[1];
+        loaded_values--;
+
+        int absidx = (idx + col_offset)/blocksize;
+        half local_absmax = __ldg(&(absmax[absidx]));
+
+        #pragma unroll 64
+        for(int col = 0; col < 64; col+=2)
+        {
+          local_B[col] = dhDequantizeNF4(local_B_4bit[col/2] >> 4)*T(absidx);
+          local_B[col+1] = dhDequantizeNF4(local_B_4bit[col/2] & 0x0F)*T(absidx);
+        }
+      }
+
+      smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0];
+
+      #pragma unroll 32
+      for(int col = 0; col < 32; col++)
+          smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = local_B[col];
+    }
+    else if(warp_id < (WARPS-1))
+    {
+      local_A[0] = T(0.0);
+      smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] =  0.0f;
+
+      #pragma unroll 32
+      for(int col = 0; col < 32; col++)
+        local_B[col] = 0.0f;
+
+      #pragma unroll 32
+      for(int col = 0; col < 32; col++)
+        smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f;
+    }
+    ticktock = ticktock == 0 ? 1 : 0;
+
+    if(warp_id == (WARPS-1))
+      for(int k = 0; k < batch_size_warps; k++)
+      {
+        wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
+        wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
+        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+      }
+  }
+
+  __syncthreads();
+  if(warp_id != (WARPS-1)){ return; }
+  // only warp_id == (WARPS-1) from here
+  int warp_lane = threadIdx.x % 32;
+
+  ticktock = ticktock == 0 ? 1 : 0;
+  for(int k = 0; k < batch_size_warps; k++)
+  {
+    wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
+    wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
+    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+  }
+
+  // 129 mu
+  if(warp_id == (WARPS-1))
+    wmma::store_matrix_sync(&(smem_A[0]), c_frag, 32, wmma::mem_row_major);
+
+  if(col_offset + warp_lane < M)
+    out[col_offset + warp_lane] = smem_A[warp_lane];
+#endif
+}
+
+//#define ROWS 2
+//template <typename T, int ITEMS, int THREADS> __global__ void gemm_device(int M, int N, int K, T const* A,  T* B,  T * out,  int lda, int ldb, int ldc)
+//{
+//// 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp
+//// 1. Load dataB into register
+//// 2. Dequantize B
+//// 3. Fetch data from A and multiply
+//
+//  typedef cub::BlockLoad<T, THREADS , ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadA;
+//  //__shared__ typename LoadA::TempStorage loada;
+//  typedef cub::BlockLoad<T, THREADS , ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadB;
+//  //__shared__ typename LoadB::TempStorage loadb;
+//  typedef hipcub::BlockReduce<T, THREADS> BlockReduce;
+//  // Allocate shared memory for BlockReduce
+//  //__shared__ typename BlockReduce::TempStorage reduce;
+//
+//  __shared__ union {
+//    typename BlockReduce::TempStorage reduce;
+//    typename LoadB::TempStorage loadb;
+//    typename LoadA::TempStorage loada;
+//  } temp_storage;
+//
+//
+//	T dataA[ITEMS];
+//  T local_B[ITEMS];
+//  T local_accC[ROWS];
+//	int valid_items = 0;
+//  const int col_offset = blockIdx.x * 8;
+//
+//	__shared__ T tileA[ROWS*THREADS*ITEMS];
+//	__shared__ T accumulatorC[ROWS*8];
+//
+//  //#pragma unroll 8
+//  //for(int i = 0; i < 8; i++)
+//  //  tileA[threadIdx.x + (i*256)] = 0.0f;
+//  //__syncthreads();
+//  if(threadIdx.x < 64)
+//    accumulatorC[threadIdx.x] = 0.0f;
+//  __syncthreads();
+//
+//
+//	for(int inner_idx = 0; inner_idx < K; inner_idx+= THREADS*ITEMS)
+//	{
+//		valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx;
+//		int baserow = 0;
+//		for(int row = baserow; row < (baserow+ROWS) && row < N; row++)
+//		{
+//			LoadA(temp_storage.loada).Load(&(A[(row*K) + inner_idx]), dataA, valid_items, 0.0f);
+//
+//      #pragma unroll ITEMS
+//      for(int k = 0; k < ITEMS; k++)
+//          tileA[row*THREADS*ITEMS + threadIdx.x + (k*THREADS)] = dataA[k];
+//
+//		__syncthreads();
+//		}
+//		baserow += ROWS;
+//
+//    // load 16 columns from B at a time. B is transposed, so its like loading rows
+//    // each warp loads one row
+//    // each thread loads 128 byte
+//
+//    // col: inner_idx + warp_lane
+//    // row: ldb*(offset + warp_id)
+//    for(int col = 0; col < 8 && (col_offset + col) < M; col++)
+//    {
+//      int colB = col_offset + col;
+//
+//      for(int k = 0; k < ROWS; k++)
+//        local_accC[k] = 0.0f;
+//
+//      int base_idxB = ldb*colB;
+//      valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx;
+//      LoadB(temp_storage.loadb).Load(&(B[base_idxB + inner_idx]), local_B, valid_items, 0.0f);
+//      __syncthreads();
+//
+//      for(int row = 0; row < ROWS && row < N; row++)
+//      {
+//        #pragma unroll ITEMS
+//        for(int k = 0; k < ITEMS; k++)
+//        {
+//          int idxA = row*THREADS*ITEMS + threadIdx.x + (THREADS*k);
+//          local_accC[row] += tileA[idxA]*local_B[k];
+//        }
+//
+//        local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], hipcub::Sum());
+//        if(threadIdx.x == 0)
+//          atomicAdd(&accumulatorC[row*8 + col], local_accC[row]);
+//      }
+//    }
+//	}
+//
+//  for(int row = 0; row < ROWS && row < N; row++)
+//  {
+//    int out_idx = ldc*row + col_offset;
+//
+//    //if(threadIdx.x < 8)
+//    //  if(accumulatorC[row*8 + threadIdx.x] != 0.0)
+//    //    printf("%i %i %i %i %f idx %i %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx, blockIdx.x);
+//
+//    if(threadIdx.x < 8 && (col_offset + threadIdx.x) < M)
+//    {
+//      //printf("%i %i %i %i %f idx %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx);
+//      out[out_idx + threadIdx.x] = accumulatorC[row*8 + threadIdx.x];
+//    }
+//  }
+//
+//
+//
+//}
+
+
+template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long n)
+{
+  for(long i = (blockDim.x*blockIdx.x) + threadIdx.x; i < n; i+=(blockDim.x*gridDim.x))
+  {
+    switch(FUNC)
+    {
+      case FILL: 
+        A[i] = (T)value;
+        break;
+      case ARANGE:
+        A[i] = (T)i;
+        break;
+      case _MUL:
+        A[i] = A[i]*B[i];
+        break;
+    }
+  }
+}
+
+
+//==============================================================
+//                   TEMPLATE DEFINITIONS
+//==============================================================
+
+template __global__ void kfunc<float, FILL>(float *A, float *B, float value, long n);
+template __global__ void kfunc<unsigned char, FILL>(unsigned char *A, unsigned char *B, unsigned char value, long n);
+template __global__ void kfunc<float, ARANGE>(float *A, float *B, float value, long n);
+template __global__ void kfunc<float, _MUL>(float *A, float *B, float value, long n);
+
+// these are not used and make no sense, but the compiler needs them
+//template __global__ void gemm_device<float, 16, 128>(int M, int N, int K, float * __restrict__ const A,  float* B,  float * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 32, 256>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 32, 192>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 32, 160>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 32, 128>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+//template __global__ void gemm_device<float, 16, 32>(int M, int N, int K, float * __restrict__ const A,  float* B,  float * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 32, 32>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 32, 64>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 32, 96>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+// these are not used and make no sense, but the compiler needs them
+
+//template __global__ void gemm_device<float, 32, 128>(int M, int N, int K, float * __restrict__ const A,  float* B,  float * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 16, 256>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 16, 192>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 16, 160>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 16, 128>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+//template __global__ void gemm_device<float, 32, 32>(int M, int N, int K, float * __restrict__ const A,  float* B,  float * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 16, 32>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 16, 64>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+template __global__ void gemm_device<half, 16, 96>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
+
+template __global__ void kgemm_4bit_inference<half, 128>(int M, int N, int K, half * __restrict__ const A, unsigned char *B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize);
+template __global__ void kgemm_4bit_inference<half, 160>(int M, int N, int K, half * __restrict__ const A, unsigned char *B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize);
+
+template __global__ void kExtractOutliers<COL_TURING>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
+template __global__ void kExtractOutliers<COL_AMPERE>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
+
+template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+
+template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
+template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
+template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL_TURING>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
+template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL_TURING>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
+template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL_AMPERE>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
+template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL_AMPERE>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
+
+template __global__ void kdequant_mm_int32_fp16<4, 128, 512>(int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats, half *out, float* newRowStats, float* newcolStats, half * __restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n);
+
+template __global__ void kDoubleRowColQuant<64, 4, 16, 64*4, 0>(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);
+template __global__ void kDoubleRowColQuant<64, 4, 16, 64*4, 1>(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);
+
+template __device__ unsigned char dQuantize<0>(float* smem_code, const float rand, float x);
+template __device__ unsigned char dQuantize<1>(float* smem_code, const float rand, float x);
+
+template __global__ void kEstimateQuantiles(float *__restrict__ const A, float *code, const float offset, const float max_val, const int n);
+template __global__ void kEstimateQuantiles(half *__restrict__ const A, float *code, const float offset, const half max_val, const int n);
+
+#define MAKE_PreconditionOptimizer32bit1State(oname, gtype) \
+template __global__ void kPreconditionOptimizer32bit1State<gtype, oname, 4096, 8>(gtype* g, gtype* p, \
+                float* state1, float *unorm, \
+                const float beta1, const float beta2, const float eps, const float weight_decay, \
+                const int step, const float lr, const float gnorm_scale, const int n); \
+
+MAKE_PreconditionOptimizer32bit1State(MOMENTUM, half)
+MAKE_PreconditionOptimizer32bit1State(MOMENTUM, float)
+MAKE_PreconditionOptimizer32bit1State(RMSPROP, half)
+MAKE_PreconditionOptimizer32bit1State(RMSPROP, float)
+MAKE_PreconditionOptimizer32bit1State(LION, half)
+MAKE_PreconditionOptimizer32bit1State(LION, float)
+MAKE_PreconditionOptimizer32bit1State(LION, __nv_bfloat16)
+MAKE_PreconditionOptimizer32bit1State(ADAGRAD, half)
+MAKE_PreconditionOptimizer32bit1State(ADAGRAD, float)
+
+#define MAKE_Optimizer32bit1State(oname, gtype) \
+template __global__ void kOptimizer32bit1State<gtype, oname>(gtype* g, gtype* p, float* state1, float *unorm, const float max_unorm, const float param_norm, \
+    const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n); \
+
+MAKE_Optimizer32bit1State(MOMENTUM, half)
+MAKE_Optimizer32bit1State(MOMENTUM, float)
+MAKE_Optimizer32bit1State(RMSPROP, half)
+MAKE_Optimizer32bit1State(RMSPROP, float)
+MAKE_Optimizer32bit1State(LION, half)
+MAKE_Optimizer32bit1State(LION, float)
+MAKE_Optimizer32bit1State(LION, __nv_bfloat16)
+MAKE_Optimizer32bit1State(ADAGRAD, half)
+MAKE_Optimizer32bit1State(ADAGRAD, float)
+
+#define MAKE_PreconditionOptimizer32bit2State(oname, gtype) \
+template __global__ void kPreconditionOptimizer32bit2State<gtype, oname, 4096, 8>(gtype* g, gtype* p,  \
+                float* state1, float* state2, float *unorm, \
+                const float beta1, const float beta2, const float eps, const float weight_decay, \
+                const int step, const float lr, const float gnorm_scale, const int n); \
+
+MAKE_PreconditionOptimizer32bit2State(ADAM, float)
+MAKE_PreconditionOptimizer32bit2State(ADAM, half)
+MAKE_PreconditionOptimizer32bit2State(ADAM, __nv_bfloat16)
+
+template __global__ void kOptimizer32bit2State<float, ADAM>(float* g, float* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
+    const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+template __global__ void kOptimizer32bit2State<half, ADAM>(half* g, half* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
+    const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+template __global__ void kOptimizer32bit2State<__nv_bfloat16, ADAM>(__nv_bfloat16* g, __nv_bfloat16* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
+    const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+
+#define MAKE_PreconditionStatic8bit1State(oname, gtype) \
+template __global__ void kPreconditionOptimizerStatic8bit1State<gtype, oname>(gtype* p, gtype* __restrict__ const g, unsigned char*__restrict__  const state1,  \
+                float *unorm,  \
+                const float beta1,  \
+                const float beta2,  \
+                const float eps, const int step,  \
+                float* __restrict__ const quantiles1,  \
+                float* max1, float* new_max1,  \
+                const float weight_decay, \
+                const float gnorm_scale,  \
+                const int n); \
+
+MAKE_PreconditionStatic8bit1State(MOMENTUM, half)
+MAKE_PreconditionStatic8bit1State(MOMENTUM, float)
+MAKE_PreconditionStatic8bit1State(RMSPROP, half)
+MAKE_PreconditionStatic8bit1State(RMSPROP, float)
+MAKE_PreconditionStatic8bit1State(LION, half)
+MAKE_PreconditionStatic8bit1State(LION, float)
+
+#define MAKE_optimizerStatic8bit1State(oname, gtype) \
+template __global__ void kOptimizerStatic8bit1State<gtype, oname>(gtype* p, gtype* const g, unsigned char* state1,  \
+                const float *unorm, const float max_unorm, const float param_norm, \
+                const float beta1,  \
+                const float beta2,  \
+                const float eps, const int step, const float lr, \
+                float* __restrict__ const quantiles1,  \
+                float* max1, float* new_max1,  \
+                float weight_decay, \
+                const float gnorm_scale,  \
+                const int n); \
+
+MAKE_optimizerStatic8bit1State(MOMENTUM, half)
+MAKE_optimizerStatic8bit1State(MOMENTUM, float)
+MAKE_optimizerStatic8bit1State(RMSPROP, half)
+MAKE_optimizerStatic8bit1State(RMSPROP, float)
+MAKE_optimizerStatic8bit1State(LION, half)
+MAKE_optimizerStatic8bit1State(LION, float)
+
+#define MAKE_PreconditionStatic8bit2State(oname, gtype) \
+template __global__ void kPreconditionOptimizerStatic8bit2State<gtype, oname>(gtype* p, gtype* __restrict__ const g, unsigned char*__restrict__  const state1, unsigned char* __restrict__ const state2, \
+                float *unorm, \
+                const float beta1, const float beta2, \
+                const float eps, const int step,  \
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, \
+                float* max1, float* max2, float* new_max1, float* new_max2, \
+                const float gnorm_scale,  \
+                const int n); \
+
+MAKE_PreconditionStatic8bit2State(ADAM, half)
+MAKE_PreconditionStatic8bit2State(ADAM, float)
+
+#define MAKE_optimizerStatic8bit2State(oname, gtype) \
+template __global__ void kOptimizerStatic8bit2State<gtype, oname>(gtype* p, gtype* const g, unsigned char* state1, unsigned char* state2, \
+                const float *unorm, const float max_unorm, const float param_norm, \
+                const float beta1, const float beta2, \
+                const float eps, const int step, const float lr, \
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, \
+                float* max1, float* max2, float* new_max1, float* new_max2, \
+                float weight_decay, \
+                const float gnorm_scale,  \
+                const int n); \
+
+MAKE_optimizerStatic8bit2State(ADAM, half)
+MAKE_optimizerStatic8bit2State(ADAM, float)
+
+template __global__ void kPercentileClipping<float, 2048, 4>(float * __restrict__ g, float *gnorm_vec, int step, const int n);
+template __global__ void kPercentileClipping<half, 2048, 4>(half * __restrict__ g, float *gnorm_vec, int step, const int n);
+
+#define MAKE_kQuantizeBlockwise(dtype, blocksize, num_per_thread, stochastic, data_type_name) \
+template __global__ void kQuantizeBlockwise<dtype, blocksize, num_per_thread, stochastic, data_type_name>(float * code, dtype * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); \
+
+MAKE_kQuantizeBlockwise(half,  4096, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(half,  4096, 4, 1, General8bit)
+MAKE_kQuantizeBlockwise(half,  2048, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(half,  1024, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(half,   512, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(half,   256, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(half,   128, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(half,    64, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(float, 4096, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(float, 4096, 4, 1, General8bit)
+MAKE_kQuantizeBlockwise(float, 2048, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(float,  512, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(float,  256, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(float,  128, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(float,   64, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(half,  4096, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(half,  2048, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(half,  1024, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(half,   512, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(half,   256, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(half,   128, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(half,    64, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(float, 4096, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(float, 2048, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(float, 1024, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(float,  512, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(float,  256, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(float,  128, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(float,   64, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(half,  4096, 4, 0, NF4)
+MAKE_kQuantizeBlockwise(half,  2048, 4, 0, NF4)
+MAKE_kQuantizeBlockwise(half,  1024, 4, 0, NF4)
+MAKE_kQuantizeBlockwise(half,   512, 2, 0, NF4)
+MAKE_kQuantizeBlockwise(half,   256, 2, 0, NF4)
+MAKE_kQuantizeBlockwise(half,   128, 2, 0, NF4)
+MAKE_kQuantizeBlockwise(half,    64, 2, 0, NF4)
+MAKE_kQuantizeBlockwise(float, 4096, 4, 0, NF4)
+MAKE_kQuantizeBlockwise(float, 2048, 4, 0, NF4)
+MAKE_kQuantizeBlockwise(float, 1024, 4, 0, NF4)
+MAKE_kQuantizeBlockwise(float,  512, 2, 0, NF4)
+MAKE_kQuantizeBlockwise(float,  256, 2, 0, NF4)
+MAKE_kQuantizeBlockwise(float,  128, 2, 0, NF4)
+MAKE_kQuantizeBlockwise(float,   64, 2, 0, NF4)
+
+template __global__ void kDequantizeBlockwise<half, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<float, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<half, 512, 64, 8, General8bit>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<float, 512, 64, 8, General8bit>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<half, 512, 64, 8, NF4>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<float, 512, 64, 8, NF4>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
+
+#define MAKE_OptimizerStatic8bit2StateBlockwise(oname, gtype, block_size, num_per_thread) \
+template __global__ void kOptimizerStatic8bit2StateBlockwise<gtype, oname, block_size, num_per_thread>(gtype* p, gtype* __restrict__ const g, unsigned char* state1, unsigned char* state2, \
+                const float beta1, const float beta2, \
+                const float eps, const int step, const float lr, \
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, \
+                float* absmax1, float* absmax2,  \
+                float weight_decay, \
+                const float gnorm_scale, const bool skip_zeros, const int n); \
+
+MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, float, 2048, 8)
+MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, half, 2048, 8)
+MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, __nv_bfloat16, 2048, 8)
+
+
+#define MAKE_OptimizerStatic8bit1StateBlockwise(oname, gtype, block_size, num_per_thread) \
+template __global__ void kOptimizerStatic8bit1StateBlockwise<gtype, oname, block_size, num_per_thread>( \
+		gtype* p, gtype* __restrict__ const g, unsigned char* state1, \
+                const float beta1, const float beta2, \
+                const float eps, const int step, const float lr, \
+                float* __restrict__ const quantiles1, \
+                float* absmax1, \
+                float weight_decay, \
+                const float gnorm_scale, const bool skip_zeros, const int n); \
+
+MAKE_OptimizerStatic8bit1StateBlockwise(MOMENTUM, float, 2048, 8)
+MAKE_OptimizerStatic8bit1StateBlockwise(MOMENTUM, half, 2048, 8)
+MAKE_OptimizerStatic8bit1StateBlockwise(RMSPROP, float, 2048, 8)
+MAKE_OptimizerStatic8bit1StateBlockwise(RMSPROP, half, 2048, 8)
+MAKE_OptimizerStatic8bit1StateBlockwise(LION, float, 2048, 8)
+MAKE_OptimizerStatic8bit1StateBlockwise(LION, half, 2048, 8)
+MAKE_OptimizerStatic8bit1StateBlockwise(LION, __nv_bfloat16, 2048, 8)
+MAKE_OptimizerStatic8bit1StateBlockwise(ADAGRAD, float, 2048, 8)
+MAKE_OptimizerStatic8bit1StateBlockwise(ADAGRAD, half, 2048, 8)
diff --git a/csrc/kernels_hip.cuh b/csrc/kernels_hip.cuh
new file mode 100644
index 000000000..90a8cf6e9
--- /dev/null
+++ b/csrc/kernels_hip.cuh
@@ -0,0 +1,132 @@
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <float.h>
+#include <ops.cuh>
+
+#ifndef kernels
+#define kernels
+
+//template <int QUANT_TYPE, typename INP_TYPE, typename COMP_TYPE, typename OUT_TYPE>__global__ void kMatmul_inference_4bit(INP_TYPE *A, unsigned char *B, OUT_TYPE *out, int lda, int ldb, int rowsA, int colsA, int colsB);
+
+template<typename T>__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n);
+
+__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n);
+__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n);
+
+template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE> __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n);
+
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
+__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
+                float* state1, float* state2, float *unorm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const int n);
+
+template<typename T, int OPTIMIZER>
+__global__ void kOptimizer32bit2State(T* g, T* p,
+                float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
+__global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
+                float* state1, float *unorm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const int n);
+
+template<typename T, int OPTIMIZER>
+__global__ void kOptimizer32bit1State(T* g, T* p,
+                float* state1,  float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+
+template<typename T, int OPTIMIZER>
+__global__ void
+kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1,
+                float *unorm,
+                const float beta1, const float beta2,
+                const float eps, const int step,
+                float* __restrict__ const quantiles1,
+                float* max1, float* new_max1,
+                const float weight_decay,
+                const float gnorm_scale, const int n);
+
+
+template<typename T, int OPTIMIZER>
+__global__ void
+kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
+                const float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float beta2,
+                const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1,
+                float* max1, float* new_max1,
+                float weight_decay, const float gnorm_scale, const int n);
+
+
+
+template<typename T, int OPTIMIZER>
+__global__ void
+kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, unsigned char* __restrict__ const state2,
+                float *unorm,
+                const float beta1, const float beta2,
+                const float eps, const int step,
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
+                float* max1, float* max2, float* new_max1, float* new_max2,
+                const float gnorm_scale, const int n);
+
+
+template<typename T, int OPTIMIZER>
+__global__ void
+kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
+                const float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float beta2,
+                const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
+                float* max1, float* max2, float* new_max1, float* new_max2,
+                float weight_decay, const float gnorm_scale, const int n);
+
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit2StateBlockwise(
+		T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
+                const float beta1, const float beta2, const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
+                float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n);
+
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit1StateBlockwise(
+		T* p, T* __restrict__ const g, unsigned char* state1,
+                const float beta1, const float beta2,
+                const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1,
+                float* absmax1,
+                float weight_decay,
+                const float gnorm_scale, const bool skip_zeros, const int n);
+
+
+template<typename T, int BLOCK_SIZE, int NUM_VALS> __global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int step, const int n);
+
+__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n);
+
+
+template <typename T, int SPMM_ITEMS, int BITS> __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out,  float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+
+template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(
+  int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats,
+  half *out, float* newRowStats, float* newcolStats, half * __restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n);
+
+template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kgetColRowStats(T * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
+template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kDoubleRowColQuant(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);
+
+template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__ void kTransformRowToFormat(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
+
+template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
+
+template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A,  T* B,  T * out,  int lda, int ldb, int ldc);
+template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize);
+
+template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long n);
+
+#endif
diff --git a/csrc/ops.hip b/csrc/ops.hip
new file mode 100644
index 000000000..3606fadc9
--- /dev/null
+++ b/csrc/ops.hip
@@ -0,0 +1,850 @@
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <ops.cuh>
+#include <kernels.cuh>
+#include <hipcub/hipcub.hpp>
+#include <limits>
+#include <BinSearch.h>
+#include <cassert>
+#include <common.h>
+
+
+using namespace BinSearch;
+using std::cout;
+using std::endl;
+
+void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
+{
+  int threads = 512;
+  int num_blocks = n/threads;
+  num_blocks = n % threads == 0 ? num_blocks : num_blocks + 1;
+ hipLaunchKernelGGL(( kHistogramScatterAdd2D), dim3(num_blocks), dim3(512), 0, 0, histogram, index1, index2, src, maxidx1, n);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
+{
+  int num_blocks = n/4096;
+  num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1;
+	CUDA_CHECK_RETURN(hipMemset(code, 0, 256*sizeof(float)));
+ hipLaunchKernelGGL(( kEstimateQuantiles<T>), dim3(num_blocks), dim3(512), 0, 0, A, code, offset, std::numeric_limits<T>::max(), n);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+void quantize(float *code, float *A, unsigned char *out, int n)
+{
+  int num_blocks = n/1024;
+  num_blocks = n % 1024 == 0 ? num_blocks : num_blocks + 1;
+ hipLaunchKernelGGL(( kQuantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+void dequantize(float *code, unsigned char *A, float *out, int n)
+{
+  int num_blocks = n/1024;
+  num_blocks = n % 1024 == 0 ? num_blocks : num_blocks + 1;
+ hipLaunchKernelGGL(( kDequantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n)
+{
+  int num_blocks = n/blocksize;
+  num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
+
+  if(blocksize == 4096)
+   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 4096, 4, STOCHASTIC, 0>), dim3(num_blocks), dim3(1024), 0, 0, code, A, absmax, out, rand, rand_offset, n);
+  else if(blocksize == 2048)
+   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 2048, 4, 0, DATA_TYPE>), dim3(num_blocks), dim3(512), 0, 0, code, A, absmax, out, rand, rand_offset, n);
+  else if(blocksize == 1024)
+   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 1024, 4, 0, DATA_TYPE>), dim3(num_blocks), dim3(256), 0, 0, code, A, absmax, out, rand, rand_offset, n);
+  else if(blocksize == 512)
+   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 512, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(256), 0, 0, code, A, absmax, out, rand, rand_offset, n);
+  else if(blocksize == 256)
+   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 256, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(128), 0, 0, code, A, absmax, out, rand, rand_offset, n);
+  else if(blocksize == 128)
+   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 128, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(64), 0, 0, code, A, absmax, out, rand, rand_offset, n);
+  else if(blocksize == 64)
+   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(32), 0, 0, code, A, absmax, out, rand, rand_offset, n);
+
+
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n)
+{
+  int num_blocks = n/blocksize;
+  num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
+  int tile_size = (DATA_TYPE > 0) ? 1024 : 512;
+
+  if(DATA_TYPE > 0)
+   hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, 0, code, A, absmax, out, blocksize/2, n);
+  else
+   hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, 0, code, A, absmax, out, blocksize, n);
+
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+
+//void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB)
+//{
+//	int num_blocks = (colsB+32-1)/32;
+//	kMatmul_inference_4bit<NF4, half, half, half><<<num_blocks, 256>>>(A, B, out, lda, ldb, rowsA, colsA, colsB);
+//  CUDA_CHECK_RETURN(hipPeekAtLastError());
+//}
+
+
+template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
+                float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n)
+{
+  int num_blocks = n/4096;
+  num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1;
+	switch(OPTIMIZER)
+	{
+		case ADAM:
+      if(max_unorm > 0.0f)
+			{
+				CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
+       hipLaunchKernelGGL(( kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
+        CUDA_CHECK_RETURN(hipPeekAtLastError());
+      }
+		hipLaunchKernelGGL((	kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
+      CUDA_CHECK_RETURN(hipPeekAtLastError());
+			break;
+		case MOMENTUM:
+    case RMSPROP:
+    case ADAGRAD:
+      if(max_unorm > 0.0f)
+			{
+				CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
+			hipLaunchKernelGGL((	kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
+        CUDA_CHECK_RETURN(hipPeekAtLastError());
+			}
+
+		hipLaunchKernelGGL((	kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
+      CUDA_CHECK_RETURN(hipPeekAtLastError());
+			break;
+    case LION:
+      // in lion, the momentum update after the parameter update
+     hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
+      CUDA_CHECK_RETURN(hipPeekAtLastError());
+
+      if(max_unorm > 0.0f)
+      {
+        CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
+       hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
+        CUDA_CHECK_RETURN(hipPeekAtLastError());
+      }
+      break;
+	}
+}
+
+template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
+                unsigned char* state1, unsigned char* state2,
+                float *unorm, float max_unorm, float param_norm,
+                float beta1, float beta2,
+                float eps, int step, float lr,
+                float* quantiles1, float* quantiles2,
+                float* max1, float* max2, float* new_max1, float* new_max2,
+                float weight_decay,
+                const float gnorm_scale, int n)
+{
+  int num_blocks = n/4096;
+  num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1;
+
+  if(max_unorm > 0.0f){ CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float))); }
+
+	switch(OPTIMIZER)
+	{
+		case ADAM:
+			CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
+			CUDA_CHECK_RETURN(hipMemset(new_max2, 0, 1*sizeof(float)));
+		hipLaunchKernelGGL((	kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
+			CUDA_CHECK_RETURN(hipPeekAtLastError());
+		hipLaunchKernelGGL((	kOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
+																														quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
+			CUDA_CHECK_RETURN(hipPeekAtLastError());
+		break;
+		case MOMENTUM:
+    case RMSPROP:
+    case ADAGRAD:
+			CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
+		hipLaunchKernelGGL((	kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
+			CUDA_CHECK_RETURN(hipPeekAtLastError());
+		hipLaunchKernelGGL((	kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
+																														quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
+			CUDA_CHECK_RETURN(hipPeekAtLastError());
+			break;
+    case LION:
+      // in lion, the momentum update happens after the parameter update
+     hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
+                                                            quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
+      CUDA_CHECK_RETURN(hipPeekAtLastError());
+
+      CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
+     hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
+      CUDA_CHECK_RETURN(hipPeekAtLastError());
+      break;
+		default:
+			break;
+	}
+}
+
+#define BLOCKSIZE_2STATE 2048
+#define NUM_2STATE 8
+#define BLOCKSIZE_1STATE 2048
+#define NUM_1STATE 8
+
+template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
+                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
+                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n)
+{
+
+	int num_blocks = 0;
+	switch(OPTIMIZER)
+	{
+		case ADAM:
+			num_blocks = n/BLOCKSIZE_2STATE;
+			num_blocks = n % BLOCKSIZE_2STATE == 0 ? num_blocks : num_blocks + 1;
+		hipLaunchKernelGGL((	kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, eps, step, lr,
+																														quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
+			CUDA_CHECK_RETURN(hipPeekAtLastError());
+		break;
+		case MOMENTUM:
+		case RMSPROP:
+    case ADAGRAD:
+    case LION:
+			num_blocks = n/BLOCKSIZE_1STATE;
+			num_blocks = n % BLOCKSIZE_1STATE == 0 ? num_blocks : num_blocks + 1;
+		hipLaunchKernelGGL((	kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3(num_blocks), dim3(BLOCKSIZE_1STATE/NUM_1STATE), 0, 0, p, g, state1, beta1, beta2, eps, step, lr,
+																														quantiles1, absmax1, weight_decay, gnorm_scale, skip_zeros, n);
+			CUDA_CHECK_RETURN(hipPeekAtLastError());
+		break;
+	}
+}
+
+
+
+template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n)
+{
+  int num_blocks = n/2048;
+  num_blocks = n % 2048 == 0 ? num_blocks : num_blocks + 1;
+	CUDA_CHECK_RETURN(hipMemset(&gnorm_vec[step % 100], 0, 1*sizeof(float)));
+ hipLaunchKernelGGL(( kPercentileClipping<T, 2048, 4>), dim3(num_blocks), dim3(512), 0, 0, g, gnorm_vec, step, n);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+void gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc)
+{
+  const int falpha = 1;
+  const int fbeta = 0;
+  const void * alpha = &falpha;
+  const void * beta = &fbeta;
+	rocblas_status status;
+
+			status = rocblas_gemmex(context->m_handle,
+					transposeA ? rocblas_operation_transpose : rocblas_operation_none,
+					transposeB ? rocblas_operation_transpose : rocblas_operation_none,
+					m, n,	k,
+					alpha, A, HIP_R_8I, lda, B, HIP_R_8I, ldb, beta,
+					C, HIP_R_32I, ldc,
+          HIP_R_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+
+    if (status != rocblas_status_success)
+    {
+      std::cout << "CUBLAS ERROR: Status " << status << std::endl;
+    }
+
+}
+
+void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
+                    long long int strideA, long long int strideB, long long int strideC, int batchCount)
+{
+  const int falpha = 1;
+  const int fbeta = 0;
+  const void * alpha = &falpha;
+  const void * beta = &fbeta;
+	rocblas_status status;
+
+  //cout << transposeA << transposeB << endl;
+  //printf("%i %i %i\n", m,n,k);
+  //printf("%i %i %i\n", lda,ldb,ldc);
+  //printf("%i %i %i\n", strideA, strideB, strideC);
+  //printf("%i\n", batchCount);
+
+			status = cublasGemmStridedBatchedEx(context->m_handle,
+					transposeA ? rocblas_operation_transpose : rocblas_operation_none,
+					transposeB ? rocblas_operation_transpose : rocblas_operation_none,
+					m, n,	k,
+					alpha, A, HIP_R_8I, lda, (long long int)strideA, B, HIP_R_8I, ldb, (long long int)strideB, beta,
+					C, HIP_R_32I, ldc, (long long int)strideC, batchCount,
+          HIP_R_32I, CUBLAS_GEMM_DEFAULT);
+
+    if (status != rocblas_status_success)
+    {
+      std::cout << "CUBLAS ERROR: Status " << status << std::endl;
+    }
+
+}
+
+int roundoff(int v, int d) {
+    return (v + d - 1) / d * d;
+}
+
+
+#ifdef NO_CUBLASLT
+#else
+template<int ORDER> cublasLtOrder_t get_order()
+{
+	switch(ORDER)
+	{
+		case ROW:
+      return CUBLASLT_ORDER_ROW;
+			break;
+    case COL:
+      return CUBLASLT_ORDER_COL;
+      break;
+    case COL32:
+      return CUBLASLT_ORDER_COL32;
+      break;
+    case COL_TURING:
+      return CUBLASLT_ORDER_COL4_4R2_8C;
+      break;
+    case COL_AMPERE:
+      return CUBLASLT_ORDER_COL32_2R_4R4;
+      break;
+		default:
+			break;
+  }
+
+	return CUBLASLT_ORDER_ROW;
+}
+
+template cublasLtOrder_t get_order<ROW>();
+template cublasLtOrder_t get_order<COL>();
+template cublasLtOrder_t get_order<COL32>();
+template cublasLtOrder_t get_order<COL_TURING>();
+template cublasLtOrder_t get_order<COL_AMPERE>();
+#endif
+
+
+template<int ORDER> int get_leading_dim(int dim1, int dim2)
+{
+	switch(ORDER)
+	{
+		case ROW:
+      return dim2;
+			break;
+    case COL:
+      return dim1;
+      break;
+    case COL32:
+      // 32*row tiles
+      return dim1*32;
+      break;
+    case COL_TURING:
+      return 32*roundoff(dim1, 8);
+      break;
+    case COL_AMPERE:
+      // 32*32 tiles
+      return 32*roundoff(dim1, 32);
+      break;
+		default:
+			return 0;
+			break;
+  }
+}
+
+template int get_leading_dim<ROW>(int dim1, int dim2);
+template int get_leading_dim<COL>(int dim1, int dim2);
+template int get_leading_dim<COL32>(int dim1, int dim2);
+
+template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(cublasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2)
+{
+#ifdef NO_CUBLASLT
+#else
+  cublasLtOrder_t orderA = get_order<SRC>();
+  cublasLtOrder_t orderOut = get_order<TARGET>();
+  int ldA = get_leading_dim<SRC>(dim1, dim2);
+  int ldOut = get_leading_dim<TARGET>(dim1, dim2);
+
+  cublasLtMatrixLayout_t A_desc = NULL, out_desc = NULL;
+  cublasLtMatrixTransformDesc_t A2Out_desc = NULL;
+  rocblas_operation opTranspose = rocblas_operation_transpose;
+  float transformAlpha = 1.0f, transformBeta = 0.0f;
+
+
+  if(DTYPE == 8)
+  {
+    checkCublasStatus(cublasLtMatrixLayoutCreate(&A_desc, HIP_R_8I, dim1, dim2, ldA));
+    checkCublasStatus(cublasLtMatrixLayoutCreate(&out_desc, HIP_R_8I, dim1, dim2, ldOut));
+  }
+  else if(DTYPE == 32)
+  {
+    checkCublasStatus(cublasLtMatrixLayoutCreate(&A_desc, HIP_R_32I, dim1, dim2, ldA));
+    checkCublasStatus(cublasLtMatrixLayoutCreate(&out_desc, HIP_R_32I, dim1, dim2, ldOut));
+  }
+  else
+  {
+    printf("ERROR WRONG TYPE FOR TRANSFORM: %i\n", DTYPE);
+  }
+
+  checkCublasStatus(cublasLtMatrixLayoutSetAttribute(A_desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &orderA, sizeof(orderA)));
+  checkCublasStatus(cublasLtMatrixLayoutSetAttribute(out_desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &orderOut, sizeof(orderOut)));
+
+  checkCublasStatus(cublasLtMatrixTransformDescCreate(&A2Out_desc, HIP_R_32F));
+
+  if(transpose){ checkCublasStatus(cublasLtMatrixTransformDescSetAttribute(A2Out_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); }
+
+  checkCublasStatus(cublasLtMatrixTransform(ltHandle, A2Out_desc, &transformAlpha, A, A_desc, &transformBeta, NULL, NULL, out, out_desc, 0));
+
+  if (A_desc) checkCublasStatus(cublasLtMatrixLayoutDestroy(A_desc));
+  if (out_desc) checkCublasStatus(cublasLtMatrixLayoutDestroy(out_desc));
+  if (A2Out_desc) checkCublasStatus(cublasLtMatrixTransformDescDestroy(A2Out_desc));
+#endif
+}
+
+template void transform<int8_t, ROW, COL, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, ROW, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, COL32, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int32_t, ROW, COL32, false, 32>(cublasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, COL_TURING, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, COL_AMPERE, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int8_t, COL32, ROW, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int32_t, COL32, ROW, false, 32>(cublasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
+
+template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+{
+#ifdef NO_CUBLASLT
+  cout << "" << endl;
+  cout << "=============================================" << endl;
+  cout << "ERROR: Your GPU does not support Int8 Matmul!" << endl;
+  cout << "=============================================" << endl;
+  cout << "" << endl;
+  assert(false);
+
+	return 0;
+#else
+    int has_error = 0;
+    cublasLtMatmulDesc_t matmulDesc = NULL;
+    cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
+    rocblas_operation opT = rocblas_operation_transpose;
+    cublasLtPointerMode_t alphaVec = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
+    cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+    cublasLtOrder_t col_turing = CUBLASLT_ORDER_COL4_4R2_8C;
+    cublasLtOrder_t col_ampere = CUBLASLT_ORDER_COL32_2R_4R4;
+
+    has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Adesc, HIP_R_8I, m, k, lda));
+    has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Bdesc, HIP_R_8I, n, k, ldb));
+
+    has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+    if(FORMATB == COL_TURING)
+      has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
+    else
+      has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
+
+    if(DTYPE_OUT == 32)
+    {
+      has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, HIP_R_32I));
+      has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+      has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, HIP_R_32I, m, n, ldc));
+      has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+      int alpha = 1, beta = 0;
+      has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, NULL, NULL, 0, 0));
+    }
+    else
+    {
+      has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, HIP_R_32F));
+      has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+      has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, HIP_R_8I, m, n, ldc));
+      has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+      if(!SCALE_ROWS)
+      {
+        float alpha = 1.0f, beta = 0.0f;
+        has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+      }
+      else
+      {
+        has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
+        has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, NULL, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+      }
+    }
+
+
+    if (Cdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Cdesc));
+    if (Bdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Bdesc));
+    if (Adesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Adesc));
+    if (matmulDesc) has_error |= checkCublasStatus(cublasLtMatmulDescDestroy(matmulDesc));
+    if(has_error == 1)
+      printf("error detected");
+
+    return has_error;
+#endif
+}
+
+int fill_up_to_nearest_multiple(int value, int multiple)
+{
+  return value + (value % multiple == 0 ? 0 : (multiple - (value % multiple)));
+}
+
+void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half *bias, int numRows, int numCols)
+{
+  int threads = 512;
+  int tileCols = fill_up_to_nearest_multiple(numCols, 32);
+  int n = numRows*tileCols;
+  int subtile_rows = 128;
+  int tilesize = 32*subtile_rows;
+  int num_blocks = numRows/subtile_rows;
+  num_blocks += (numRows % subtile_rows == 0) ? 0 : 1;
+  num_blocks = num_blocks*(tileCols/32);
+  assert(threads <= tilesize);
+
+ hipLaunchKernelGGL(( kdequant_mm_int32_fp16<4, 128, 512>), dim3(num_blocks), dim3(threads), 0, 0, A, rowStats, colStats, out, newRowStats, newcolStats, bias, numRows, numCols, tileCols, n);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+#define STATS_THREADS 64
+#define STATS_ITEMS 4
+#define STATS_ROWS 16
+void getColRowStats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols)
+{
+  int tile_cols = STATS_THREADS*STATS_ITEMS;
+  int tiledCols = fill_up_to_nearest_multiple(cols, tile_cols);
+  int tiledRows = fill_up_to_nearest_multiple(rows, STATS_ROWS);
+	int row_tiles = (tiledRows/STATS_ROWS);
+	int col_tiles = (tiledCols/tile_cols);
+	row_tiles = row_tiles > 0 ? row_tiles : 1;
+	col_tiles = col_tiles > 0 ? col_tiles : 1;
+  int num_blocks = row_tiles * col_tiles;
+
+  if(nnz_threshold == 0.0)
+   hipLaunchKernelGGL(( kgetColRowStats<half, STATS_THREADS, STATS_ITEMS, STATS_ROWS, STATS_THREADS*STATS_ITEMS, 0>), dim3(num_blocks), dim3(STATS_THREADS), 0, 0, A, rowStats, colStats, nnz_count_row, nnz_threshold, rows, cols, tiledRows, tiledCols);
+  else if(nnz_threshold != 0.0)
+   hipLaunchKernelGGL(( kgetColRowStats<half, STATS_THREADS, STATS_ITEMS, STATS_ROWS, STATS_THREADS*STATS_ITEMS, 1>), dim3(num_blocks), dim3(STATS_THREADS), 0, 0, A, rowStats, colStats, nnz_count_row, nnz_threshold, rows, cols, tiledRows, tiledCols);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+
+}
+
+void doubleRowColQuant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int *nnz_block_ptr, float threshold, int rows, int cols)
+{
+  int threads = 64;
+  int items_per_thread = 4;
+  int tile_cols = threads*items_per_thread;
+  int tile_rows = 16;
+  int tiledCols = fill_up_to_nearest_multiple(cols, tile_cols);
+  int tiledRows = fill_up_to_nearest_multiple(rows, tile_rows);
+	int row_tiles = (tiledRows/tile_rows);
+	int col_tiles = (tiledCols/tile_cols);
+	row_tiles = row_tiles > 0 ? row_tiles : 1;
+	col_tiles = col_tiles > 0 ? col_tiles : 1;
+  int num_blocks = row_tiles * col_tiles;
+
+
+  if(threshold > 0.0f)
+   hipLaunchKernelGGL(( kDoubleRowColQuant<64, 4, 16, 64*4, 1>), dim3(num_blocks), dim3(threads), 0, 0, A, rowStats, colStats, out_col_normed, out_row_normed, rowidx, colidx, val, nnz_block_ptr, threshold, rows, cols, tiledCols);
+  else
+   hipLaunchKernelGGL(( kDoubleRowColQuant<64, 4, 16, 64*4, 0>), dim3(num_blocks), dim3(threads), 0, 0, A, rowStats, colStats, out_col_normed, out_row_normed, rowidx, colidx, val, nnz_block_ptr, threshold, rows, cols, tiledCols);
+
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+template <int FORMAT, int TRANSPOSE> void transformRowToFormat(char * A, char *out, int rows, int cols)
+{
+  int threads = 256;
+  int items_per_thread = 8;
+  // we load 128 column values per warp
+  int tile_cols = 32*items_per_thread;
+  int tile_rows = 32;
+  int tiledCols = fill_up_to_nearest_multiple(cols, tile_cols);
+  int tiledRows = fill_up_to_nearest_multiple(rows, tile_rows);
+	int row_tiles = (tiledRows/tile_rows);
+	int col_tiles = (tiledCols/tile_cols);
+	row_tiles = row_tiles > 0 ? row_tiles : 1;
+	col_tiles = col_tiles > 0 ? col_tiles : 1;
+  int num_blocks = row_tiles * col_tiles;
+
+  int outCols = fill_up_to_nearest_multiple(cols, 32);
+  int outRows = fill_up_to_nearest_multiple(rows, 32);
+  if(FORMAT == COL_TURING)
+  {
+    if(TRANSPOSE)
+      outRows = fill_up_to_nearest_multiple(cols, 8);
+    else
+      outRows = fill_up_to_nearest_multiple(rows, 8);
+  }
+  else if(FORMAT == COL_AMPERE)
+  {
+    if(TRANSPOSE)
+      outRows = fill_up_to_nearest_multiple(cols, 32);
+    else
+      outRows = fill_up_to_nearest_multiple(rows, 32);
+  }
+  else
+  {
+    if(TRANSPOSE)
+    {
+      outCols = fill_up_to_nearest_multiple(rows, 32);
+      outRows = cols;
+    }
+  }
+
+ hipLaunchKernelGGL(( kTransformRowToFormat<256, 8, 32, 32*8, TRANSPOSE, FORMAT>), dim3(num_blocks), dim3(threads), 0, 0, A, out, rows, cols, tiledCols, outRows, outCols);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+void spmm_coo(hipsparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B)
+{
+
+#ifdef NO_CUBLASLT
+#else
+
+    hipsparseSpMatDescr_t descA;
+    hipsparseDnMatDescr_t descB, descC;
+
+    float alpha = 1.0f;
+    float beta = 0.0f;
+    void *dBuffer = NULL;
+    size_t bufferSize = 0;
+
+    CHECK_CUSPARSE( hipsparseCreateCoo(&descA, A_rows, A_cols, A_nnz,
+                                      A_rowidx, A_colidx, A_vals,
+                                      HIPSPARSE_INDEX_32I,
+                                      HIPSPARSE_INDEX_BASE_ZERO, HIP_R_16F) );
+    // Create dense matrix C
+    CHECK_CUSPARSE( hipsparseCreateDnMat(&descC, A_rows, B_cols, ldc, C,
+                                        HIP_R_16F, CUSPARSE_ORDER_ROW) );
+    // Create dense matrix B
+    if(transposed_B)
+    {
+      int tmp = A_cols;
+      A_cols = B_cols;
+      B_cols = tmp;
+    }
+
+    CHECK_CUSPARSE( hipsparseCreateDnMat(&descB, A_cols, B_cols, ldb, B,
+                                        HIP_R_16F, CUSPARSE_ORDER_ROW) );
+    // allocate an external buffer if needed
+    CHECK_CUSPARSE( hipsparseSpMM_bufferSize(
+                                 handle,
+                                 HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                 transposed_B ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                 &alpha, descA, descB, &beta, descC, HIP_R_32F,
+                                 CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) );
+    CUDA_CHECK_RETURN( hipMalloc(&dBuffer, bufferSize) );
+
+    // execute SpMM
+    CHECK_CUSPARSE( hipsparseSpMM(handle,
+                                 HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                 transposed_B ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                 &alpha, descA, descB, &beta, descC, HIP_R_32F,
+                                 CUSPARSE_SPMM_ALG_DEFAULT, dBuffer));
+
+    // destroy matrix/vector descriptors
+    CHECK_CUSPARSE( hipsparseDestroySpMat(descA) );
+    CHECK_CUSPARSE( hipsparseDestroyDnMat(descB) );
+    CHECK_CUSPARSE( hipsparseDestroyDnMat(descC) );
+    CUDA_CHECK_RETURN( hipFree(dBuffer) );
+#endif
+}
+
+template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
+{
+
+ hipLaunchKernelGGL(( kspmm_coo_very_sparse_naive<T, 8, BITS>), dim3(nnz_rows), dim3(256), 0, 0, max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+
+template <int FORMAT> void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols)
+{
+  int threads = 256;
+  // we load 128 column values per warp
+  int tiledCols = tiledCols = fill_up_to_nearest_multiple(cols, 32);
+  int tiledRows = 0;
+
+	int num_blocks = idx_size;
+
+  if(FORMAT == COL_TURING)
+  {
+      tiledRows = fill_up_to_nearest_multiple(rows, 8);
+  }
+  else if(FORMAT == COL_AMPERE)
+  {
+      tiledRows = fill_up_to_nearest_multiple(rows, 32);
+	}
+
+ hipLaunchKernelGGL(( kExtractOutliers<FORMAT>), dim3(num_blocks), dim3(threads), 0, 0, A, idx, out, idx_size, rows, cols, tiledRows, tiledCols);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+
+
+
+template <typename T> void gemm_host(int m, int n, int k, T * A,  T* B,  T * out,  int lda, int ldb, int ldc, int bits)
+{
+
+	int num_blocks = (m+31)/32;
+
+	//cout << num_blocks << endl;
+	//cout << lda << endl;
+	//cout << ldb << endl;
+	//cout << ldc << endl;
+
+	//cout << m << endl;
+	//cout << n << endl;
+	//cout << k << endl;
+  //if(bits == 32)
+    //gemm_device<T, 32, 128><<< num_blocks, 128, 0, 0 >>>(m,  n,  k, A,  B,  out, lda, ldb, ldc);
+    //gemm_device<T, 32, 32><<< num_blocks, 32, 0, 0 >>>(m,  n,  k, A,  B,  out, lda, ldb, ldc);
+  if(bits == 16)
+    //gemm_device<T, 16, 256><<< num_blocks, 256, 0, 0 >>>(m,  n,  k, A,  B,  out, lda, ldb, ldc);
+   hipLaunchKernelGGL(( gemm_device<T, 16, 160>),  dim3(num_blocks), dim3(160), 0, 0 , m,  n,  k, A,  B,  out, lda, ldb, ldc);
+    //gemm_device<T, 16, 128><<< num_blocks, 128, 0, 0 >>>(m,  n,  k, A,  B,  out, lda, ldb, ldc);
+    //gemm_device<T, 16, 96><<< num_blocks, 96, 0, 0 >>>(m,  n,  k, A,  B,  out, lda, ldb, ldc);
+    //gemm_device<T, 16, 32><<< num_blocks, 32, 0, 0 >>>(m,  n,  k, A,  B,  out, lda, ldb, ldc);
+    //gemm_device<T, 16, 64><<< num_blocks, 64, 0, 0 >>>(m,  n,  k, A,  B,  out, lda, ldb, ldc);
+}
+
+template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize)
+{
+
+	int num_blocks = (m+31)/32;
+
+	//cout << num_blocks << endl;
+	//cout << lda << endl;
+	//cout << ldb << endl;
+	//cout << ldc << endl;
+
+	//cout << m << endl;
+	//cout << n << endl;
+	//cout << k << endl;
+ hipLaunchKernelGGL(( kgemm_4bit_inference<T, 160>),  dim3(num_blocks), dim3(160), 0, 0 , m,  n,  k, A,  B, absmax, out, lda, ldb, ldc, blocksize);
+  //kgemm_4bit_inference<T, 32><<< num_blocks, 32, 0, 0 >>>(m,  n,  k, A,  B, absmax, out, lda, ldb, ldc, blocksize);
+}
+
+template <typename T, int FUNC> void func(T *A, T *B, T value, long n)
+{
+  int threads = 512;
+  int blocks = n/threads;
+  blocks = n % threads == 0 ? blocks : blocks + 1;
+  blocks = blocks > 65535 ? 65535 : blocks;
+ hipLaunchKernelGGL(( kfunc<T, FUNC>), dim3(blocks), dim3(512), 0, 0, A, B, value, n);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
+//==============================================================
+//                   TEMPLATE DEFINITIONS
+//==============================================================
+
+template void func<float, FILL>(float *A, float *B, float value, long n);
+template void func<unsigned char, FILL>(unsigned char *A, unsigned char *B, unsigned char value, long n);
+template void func<float, ARANGE>(float *A, float *B, float value, long n);
+template void func<float, _MUL>(float *A, float *B, float value, long n);
+
+template void gemm_4bit_inference<half>(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize);
+//template void gemm_host<float>(int m, int n, int k, float * A,  float* B,  float * out,  int lda, int ldb, int ldc, int bits);
+template void gemm_host<half>(int m, int n, int k, half * A,  half* B,  half * out,  int lda, int ldb, int ldc, int bits);
+template void extractOutliers<COL_TURING>(char * A, int *idx, char *out, int idx_size, int rows, int cols);
+template void extractOutliers<COL_AMPERE>(char * A, int *idx, char *out, int idx_size, int rows, int cols);
+
+template void spmm_coo_very_sparse_naive<half, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
+template void spmm_coo_very_sparse_naive<signed char, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
+
+template int igemmlt<COL_TURING, 32, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_TURING, 8, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_TURING, 8, 1>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_AMPERE, 32, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_AMPERE, 8, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_AMPERE, 8, 1>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+
+template void transformRowToFormat<COL32, 0>(char * A, char *out, int rows, int cols);
+template void transformRowToFormat<COL32, 1>(char * A, char *out, int rows, int cols);
+template void transformRowToFormat<COL_TURING, 0>(char * A, char *out, int rows, int cols);
+template void transformRowToFormat<COL_TURING, 1>(char * A, char *out, int rows, int cols);
+template void transformRowToFormat<COL_AMPERE, 0>(char * A, char *out, int rows, int cols);
+template void transformRowToFormat<COL_AMPERE, 1>(char * A, char *out, int rows, int cols);
+
+template void estimateQuantiles(half *A, float *code, float offset, int n);
+template void estimateQuantiles(float *A, float *code, float offset, int n);
+
+template void quantizeBlockwise<half, 1, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<float, 1, General8bit>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<half, 0, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<float, 0, General8bit>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<half, 0, FP4>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<float, 0, FP4>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<half, 0, NF4>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<float, 0, NF4>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void dequantizeBlockwise<half, General8bit>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
+template void dequantizeBlockwise<float, General8bit>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
+template void dequantizeBlockwise<half, FP4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
+template void dequantizeBlockwise<float, FP4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
+template void dequantizeBlockwise<half, NF4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
+template void dequantizeBlockwise<float, NF4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
+
+#define MAKE_optimizer32bit(name, gtype) \
+template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
+                float* state1, float* state2, float* unorm, float max_unorm, float param_norm, \
+                const float beta1, const float beta2, const float eps, const float weight_decay, \
+                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+
+MAKE_optimizer32bit(ADAM, half)
+MAKE_optimizer32bit(ADAM, float)
+MAKE_optimizer32bit(ADAM, __nv_bfloat16)
+MAKE_optimizer32bit(MOMENTUM, half)
+MAKE_optimizer32bit(MOMENTUM, float)
+MAKE_optimizer32bit(RMSPROP, half)
+MAKE_optimizer32bit(RMSPROP, float)
+MAKE_optimizer32bit(LION, half)
+MAKE_optimizer32bit(LION, float)
+MAKE_optimizer32bit(LION, __nv_bfloat16)
+MAKE_optimizer32bit(ADAGRAD, half)
+MAKE_optimizer32bit(ADAGRAD, float)
+
+#define MAKE_optimizerStatic8bit(name, gtype) \
+template void optimizerStatic8bit<gtype, name>(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
+                float *unorm, float max_unorm, float param_norm, \
+                float beta1, float beta2, \
+                float eps, int step, float lr,  \
+                float* quantiles1, float* quantiles2, \
+                float* max1, float* max2, float* new_max1, float* new_max2, \
+                float weight_decay, \
+                const float gnorm_scale, int n); \
+
+MAKE_optimizerStatic8bit(ADAM, half)
+MAKE_optimizerStatic8bit(ADAM, float)
+MAKE_optimizerStatic8bit(MOMENTUM, half)
+MAKE_optimizerStatic8bit(MOMENTUM, float)
+MAKE_optimizerStatic8bit(RMSPROP, half)
+MAKE_optimizerStatic8bit(RMSPROP, float)
+MAKE_optimizerStatic8bit(LION, half)
+MAKE_optimizerStatic8bit(LION, float)
+
+#define MAKE_optimizerStatic8bitBlockwise(gtype, optim_name) \
+template void optimizerStatic8bitBlockwise<gtype, optim_name>(gtype* p, gtype* g, \
+                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,  \
+                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n); \
+
+MAKE_optimizerStatic8bitBlockwise(half, ADAM);
+MAKE_optimizerStatic8bitBlockwise(float, ADAM);
+MAKE_optimizerStatic8bitBlockwise(half, MOMENTUM);
+MAKE_optimizerStatic8bitBlockwise(float, MOMENTUM);
+MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
+MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
+MAKE_optimizerStatic8bitBlockwise(half, LION);
+MAKE_optimizerStatic8bitBlockwise(float, LION);
+MAKE_optimizerStatic8bitBlockwise(__nv_bfloat16, LION);
+MAKE_optimizerStatic8bitBlockwise(half, ADAGRAD);
+MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD);
+
+template void percentileClipping(float * g, float *gnorm_vec, int step, const int n);
+template void percentileClipping(half * g, float *gnorm_vec, int step, const int n);
+
+MAKE_optimizerStatic8bitBlockwise(__nv_bfloat16, ADAM);
diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh
new file mode 100644
index 000000000..cddc6d913
--- /dev/null
+++ b/csrc/ops_hip.cuh
@@ -0,0 +1,207 @@
+// !!! This is a file automatically generated by hipify!!!
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#ifndef ops_H
+#define ops_H
+
+#include <stdio.h>
+#include <iostream>
+#include <unistd.h>
+#include <assert.h>
+
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_fp16.h>
+#include <rocblas/rocblas.h>
+#include <cublasLt.h>
+#include <hipsparse/hipsparse.h>
+#include <vector>
+#include <functional>
+
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+
+
+#define CUDA_CHECK_RETURN(value) {                      \
+  hipError_t _m_cudaStat = value;                    \
+  if (_m_cudaStat != hipSuccess) {                   \
+    fprintf(stderr, "Error %s at line %d in file %s\n",         \
+        hipGetErrorString(_m_cudaStat), __LINE__, __FILE__);   \
+    exit(1);                              \
+  } }
+
+#define THREADS_PER_BLOCKS (512)
+
+#define CHECK_CUSPARSE(value) {                      \
+  hipsparseStatus_t _m_cudaStat = value;                    \
+  if (_m_cudaStat != HIPSPARSE_STATUS_SUCCESS) {                   \
+    fprintf(stderr, "Error %s at line %d in file %s\n",         \
+        cusparseGetErrorString(_m_cudaStat), __LINE__, __FILE__);   \
+    exit(1);                              \
+  } }
+
+
+#define THREADS_PER_BLOCKS (512)
+
+
+inline void checkCudaStatus(hipError_t status) {
+    if (status != hipSuccess) {
+        printf("cuda API failed with status %d: %s\n", status, hipGetErrorString(status));
+        throw std::logic_error("cuda API failed");
+    }
+}
+
+inline int checkCublasStatus(rocblas_status status) {
+    if (status != rocblas_status_success) {
+        printf("cuBLAS API failed with status %d\n", status);
+        //throw std::logic_error("cuBLAS API failed");
+        return 1;
+    }
+    return 0;
+}
+
+typedef enum Operations_t
+{
+	ksmul = 0,
+} Operations_t;
+
+typedef enum Optimizer_t
+{
+	ADAM = 0,
+	MOMENTUM = 1,
+  RMSPROP = 2,
+  LARS = 3,
+  ADAGRAD = 4,
+  LION = 5,
+} Optimizer_t;
+
+typedef enum Transform_t
+{
+	ROW = 0,
+	COL = 1,
+  COL32 = 2,
+  COL_TURING = 3,
+  COL_AMPERE = 4,
+} Transform_t;
+
+typedef enum DataType_t
+{
+	General8bit = 0,
+	FP4 = 1,
+  NF4 = 2,
+} DataType_t;
+
+typedef enum Funcs_t
+{
+	FILL = 0,
+	ARANGE = 1,
+	_MUL = 2,
+} Funcs_t;
+
+class Context
+{
+    public:
+				rocblas_handle m_handle;
+
+				Context()
+				{
+					rocblas_handle handle;
+					rocblas_create_handle(&handle);
+					m_handle = handle;
+				}
+
+};
+
+class ContextLt
+{
+    public:
+				cublasLtHandle_t m_handle;
+
+				ContextLt()
+				{
+					cublasLtHandle_t handle;
+					cublasLtCreate(&handle);
+					m_handle = handle;
+				}
+
+};
+
+class ContextCusparse
+{
+    public:
+				hipsparseHandle_t m_handle;
+
+				ContextCusparse()
+				{
+					hipsparseHandle_t handle;
+					hipsparseCreate(&handle);
+					m_handle = handle;
+				}
+
+};
+
+
+template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n);
+
+void quantize(float *code, float *A, unsigned char *out, int n);
+void dequantize(float *code, unsigned char *A, float *out, int n);
+template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n);
+
+template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
+                float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
+                float beta1, float beta2, float eps, float weight_decay,
+                int step, float lr, const float gnorm_scale, bool skip_zeros, int n);
+
+template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g, unsigned char* state1, unsigned char* state2,
+                float *unorm, float max_unorm, float param_norm,
+                float beta1, float beta2,
+                float eps, int step, float lr,
+                float* quantiles1, float* quantiles2,
+                float* max1, float* max2, float* new_max1, float* new_max2,
+                float weight_decay,
+                const float gnorm_scale, int n);
+
+template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
+                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
+                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale,
+								bool skip_zeros, int n);
+
+template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n);
+
+void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n);
+
+void gemmex(Context * context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc);
+void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
+                    long long int strideA, long long int strideB, long long int strideC, int batchCount);
+
+
+template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+
+template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(cublasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2);
+void cutlass_igemm(bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc);
+void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols);
+void getColRowStats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols);
+void doubleRowColQuant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed,
+                       int *rowidx, int *colidx, half *val, int *nnz_block_ptr, float threshold, int rows, int cols);
+
+template <int FORMAT, int TRANSPOSE> void transformRowToFormat(char * A, char *out, int rows, int cols);
+
+void spmm_coo(hipsparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B);
+
+template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
+
+template <int FORMAT> void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols);
+
+void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB);
+
+template <typename T> void gemm_host(int m, int n, int k, T * A,  T* B,  T * out,  int lda, int ldb, int ldc, int bits);
+template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize);
+
+template <typename T, int FUNC> void func(T *A, T *B, T value, long n);
+
+#endif
diff --git a/csrc/pythonInterface_hip.c b/csrc/pythonInterface_hip.c
new file mode 100644
index 000000000..21dab4580
--- /dev/null
+++ b/csrc/pythonInterface_hip.c
@@ -0,0 +1,375 @@
+// !!! This is a file automatically generated by hipify!!!
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#if BUILD_CUDA
+#include <ops.cuh>
+#endif
+#include <cpu_ops.h>
+
+// We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
+// We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but its better than to
+// maintain all that boilerplate
+//===================================================================================
+//                               UNMANGLED CALLS
+//===================================================================================
+
+#if BUILD_CUDA
+void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles<float>(A, code, offset, n); }
+void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles<half>(A, code, offset, n); }
+
+
+//void gemm_host_fp32(int M, int N, int K, float * A,  float* B,  float * out,  int lda, int ldb, int ldc)
+//{ gemm_host<float>(M, N, K, A, B, out, lda, ldb, ldc, 32); }
+void gemm_host_fp16(int M, int N, int K, half * A,  half* B,  half * out,  int lda, int ldb, int ldc)
+{ gemm_host<half>(M, N, K, A, B, out, lda, ldb, ldc, 16); }
+
+void gemm_4bit_inference(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize)
+{ gemm_4bit_inference<half>(m, n, k, A, B, absmax,  out, lda, ldb, ldc, blocksize); }
+
+#define MAKE_ELEMENTWISE_FUNC(fname, type_name, ctype, FUNC) \
+void fname##_##type_name(ctype *A, ctype *B, ctype value, long n){ func<ctype, FUNC>(A, B, value, n); } \
+
+MAKE_ELEMENTWISE_FUNC(fill, fp32, float, FILL)
+MAKE_ELEMENTWISE_FUNC(fill, uint8, unsigned char, FILL)
+MAKE_ELEMENTWISE_FUNC(arange, fp32, float, ARANGE)
+MAKE_ELEMENTWISE_FUNC(_mul, fp32, float, _MUL)
+
+
+#define MAKE_FUNC32(fname, oname, gtype, gbits) \
+void fname##32bit_grad_##gbits(gtype *g, gtype *p, \
+               float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
+               const float beta1, const float beta2, const float eps, const float weight_decay, \
+               const int step, const float lr, float gnorm_scale, bool skip_zeros, const int n) \
+{ optimizer32bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n); } \
+
+MAKE_FUNC32(momentum, MOMENTUM, float, 32)
+MAKE_FUNC32(momentum, MOMENTUM, half, 16)
+MAKE_FUNC32(adam, ADAM, float, fp32)
+MAKE_FUNC32(adam, ADAM, half, fp16)
+MAKE_FUNC32(adam, ADAM, __nv_bfloat16, bf16)
+MAKE_FUNC32(rmsprop, RMSPROP, float, 32)
+MAKE_FUNC32(rmsprop, RMSPROP, half, 16)
+MAKE_FUNC32(lion, LION, float, fp32)
+MAKE_FUNC32(lion, LION, half, fp16)
+MAKE_FUNC32(lion, LION, __nv_bfloat16, bf16)
+MAKE_FUNC32(adagrad, ADAGRAD, float, 32)
+MAKE_FUNC32(adagrad, ADAGRAD, half, 16)
+
+#define MAKE_FUNC8(fname, oname, gtype, gbits) \
+void fname##_static_8bit_grad_##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
+								float *unorm, float max_unorm, float param_norm, \
+                float beta1, float beta2, \
+                float eps, int step, float lr,  \
+                float* quantiles1, float* quantiles2, \
+                float* max1, float* max2, float* new_max1, float* new_max2, \
+                float weight_decay, float gnorm_scale, int n) \
+{  \
+	optimizerStatic8bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
+			                                  quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
+} \
+
+MAKE_FUNC8(adam, ADAM, float, 32)
+MAKE_FUNC8(adam, ADAM, half, 16)
+MAKE_FUNC8(momentum, MOMENTUM, float, 32)
+MAKE_FUNC8(momentum, MOMENTUM, half, 16)
+MAKE_FUNC8(rmsprop, RMSPROP, float, 32)
+MAKE_FUNC8(rmsprop, RMSPROP, half, 16)
+MAKE_FUNC8(lion, LION, float, 32)
+MAKE_FUNC8(lion, LION, half, 16)
+
+#define MAKE_BLOCKWISE8(fname, optim_name, gtype, gbits) \
+void fname##_8bit_blockwise_grad_##gbits(gtype* p, gtype* g, \
+                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
+                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n)\
+{	optimizerStatic8bitBlockwise<gtype, optim_name>(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); }\
+
+MAKE_BLOCKWISE8(adam, ADAM, half, fp16)
+MAKE_BLOCKWISE8(adam, ADAM, float, fp32)
+MAKE_BLOCKWISE8(momentum, MOMENTUM, half, fp16)
+MAKE_BLOCKWISE8(momentum, MOMENTUM, float, fp32)
+MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, fp16)
+MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, fp32)
+MAKE_BLOCKWISE8(adagrad, ADAGRAD, half, fp16)
+MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, fp32)
+MAKE_BLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16)
+MAKE_BLOCKWISE8(lion, LION, half, fp16)
+MAKE_BLOCKWISE8(lion, LION, float, fp32)
+MAKE_BLOCKWISE8(lion, LION, __nv_bfloat16, bf16)
+
+
+void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); }
+void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping<half>(g, gnorm_vec, step, n); }
+
+void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_fp32_nf4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
+
+void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half, General8bit>(code, A, absmax, out, blocksize, n); } \
+void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, General8bit>(code, A, absmax, out, blocksize, n); }
+void dequantizeBlockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half, FP4>(NULL, A, absmax, out, blocksize, n); } \
+void dequantizeBlockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, FP4>(NULL, A, absmax, out, blocksize, n); }
+void dequantizeBlockwise_fp16_nf4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half, NF4>(NULL, A, absmax, out, blocksize, n); } \
+void dequantizeBlockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, NF4>(NULL, A, absmax, out, blocksize, n); }
+
+
+#define MAKE_FUNC_TRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
+void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(cublasLtHandle_t ltHandle, dtype *A, dtype *out, int dim1, int dim2) \
+{ \
+	transform<dtype, src, target, transpose, bits>(ltHandle, A, out, dim1, dim2); \
+} \
+
+MAKE_FUNC_TRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8);
+MAKE_FUNC_TRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8);
+MAKE_FUNC_TRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8);
+MAKE_FUNC_TRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32);
+MAKE_FUNC_TRANSFORM(8, row, col_turing, n, int8_t, ROW, COL_TURING, false, 8);
+MAKE_FUNC_TRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8);
+MAKE_FUNC_TRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8);
+MAKE_FUNC_TRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32);
+
+void transform_row2col32(char * A, char *out, int rows, int cols){ transformRowToFormat<COL32, 0>(A, out, rows, cols); }
+void transform_row2col32T(char * A, char *out, int rows, int cols){ transformRowToFormat<COL32, 1>(A, out, rows, cols); }
+void transform_row2turing(char * A, char *out, int rows, int cols){ transformRowToFormat<COL_TURING, 0>(A, out, rows, cols); }
+void transform_row2turingT(char * A, char *out, int rows, int cols){ transformRowToFormat<COL_TURING, 1>(A, out, rows, cols); }
+void transform_row2ampere(char * A, char *out, int rows, int cols){ transformRowToFormat<COL_AMPERE, 0>(A, out, rows, cols); }
+void transform_row2ampereT(char * A, char *out, int rows, int cols){ transformRowToFormat<COL_AMPERE, 1>(A, out, rows, cols); }
+
+void extractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_TURING>(A, idx, out, idx_size, rows, cols); }
+void extractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_AMPERE>(A, idx, out, idx_size, rows, cols); }
+
+ int igemmlt_turing_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_TURING, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_turing_8(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_TURING, 8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_turing_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_TURING, 8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_ampere_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_AMPERE, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_ampere_8(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_AMPERE, 8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_ampere_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_AMPERE, 8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+void spmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
+{ spmm_coo_very_sparse_naive<half, 16>(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
+
+void spmm_coo_very_sparse_naive_int8(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
+{ spmm_coo_very_sparse_naive<signed char, 8>(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
+#endif
+
+extern "C"
+{
+#if BUILD_CUDA
+	void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
+	void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
+	void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
+	void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); }
+  void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
+  void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); }
+
+  void cdequantize_blockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); }
+
+  void cquantize_blockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n); }
+  void cquantize_blockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); }
+  void cquantize_blockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n); }
+  void cquantize_blockwise_fp32_nf4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_fp16_nf4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); }
+
+	#define MAKE_CFUNC32(name, gtype, gbits) \
+	void c##name##32bit_grad_##gbits(gtype *g, gtype *p, \
+								 float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
+								 const float beta1, const float beta2, const float eps, const float weight_decay, \
+								 const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n) \
+	{ name##32bit_grad_##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n); } \
+
+	MAKE_CFUNC32(adam, float, fp32)
+	MAKE_CFUNC32(adam, half, fp16)
+	MAKE_CFUNC32(adam, __nv_bfloat16, bf16)
+	MAKE_CFUNC32(momentum, float, 32)
+	MAKE_CFUNC32(momentum, half, 16)
+	MAKE_CFUNC32(rmsprop, float, 32)
+	MAKE_CFUNC32(rmsprop, half, 16)
+	MAKE_CFUNC32(lion, float, fp32)
+	MAKE_CFUNC32(lion, half, fp16)
+	MAKE_CFUNC32(lion, __nv_bfloat16, bf16)
+	MAKE_CFUNC32(adagrad, float, 32)
+	MAKE_CFUNC32(adagrad, half, 16)
+
+	#define MAKE_CFUNC8(name, gtype, gbits) \
+	void c##name##_static_8bit_grad_##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
+                float *unorm, float max_unorm, float param_norm, \
+                float beta1, float beta2, \
+                float eps, int step, float lr,  \
+                float* quantiles1, float* quantiles2, \
+                float* max1, float* max2, float* new_max1, float* new_max2, \
+                float weight_decay, float gnorm_scale, int n) \
+  {  \
+	    name##_static_8bit_grad_##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
+			                                 quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
+  } \
+
+	MAKE_CFUNC8(adam, float, 32)
+	MAKE_CFUNC8(adam, half, 16)
+	MAKE_CFUNC8(momentum, float, 32)
+	MAKE_CFUNC8(momentum, half, 16)
+	MAKE_CFUNC8(rmsprop, float, 32)
+	MAKE_CFUNC8(rmsprop, half, 16)
+	MAKE_CFUNC8(lion, float, 32)
+	MAKE_CFUNC8(lion, half, 16)
+
+  #define MAKE_CBLOCKWISE8(fname, optim_name, gtype, gbits) \
+  void c##fname##_8bit_blockwise_grad_##gbits(gtype* p, gtype* g, \
+                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,  \
+                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n) \
+  {	fname##_8bit_blockwise_grad_##gbits(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); } \
+
+	MAKE_CBLOCKWISE8(adam, ADAM, half, fp16)
+	MAKE_CBLOCKWISE8(adam, ADAM, float, fp32)
+	MAKE_CBLOCKWISE8(momentum, MOMENTUM, half, fp16)
+	MAKE_CBLOCKWISE8(momentum, MOMENTUM, float, fp32)
+	MAKE_CBLOCKWISE8(rmsprop, RMSPROP, half, fp16)
+	MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, fp32)
+	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, half, fp16)
+	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, float, fp32)
+	MAKE_CBLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16)
+	MAKE_CBLOCKWISE8(lion, LION, half, fp16)
+	MAKE_CBLOCKWISE8(lion, LION, float, fp32)
+	MAKE_CBLOCKWISE8(lion, LION, __nv_bfloat16, bf16)
+
+	void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); }
+	void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); }
+	void chistogram_scatter_add_2d(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n){ histogramScatterAdd2D(histogram, index1, index2, src, maxidx1, n); }
+
+	void cigemm(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc)
+	{ gemmex(context, transposeA, transposeB, m, n, k, A, B, C, lda, ldb, ldc); }
+	void cbatched_igemm(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
+			               long strideA, long strideB, long strideC, int batchCount)
+	{ strided_gemmex(context, transposeA, transposeB, m, n, k, A, B, C, lda, ldb, ldc, strideA, strideB, strideC, batchCount); }
+
+	Context *get_context(){ return new Context(); }
+	ContextCusparse *get_cusparse(){ return new ContextCusparse(); }
+
+	int cigemmlt_turing_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_turing_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+	//{ (cublasLtHandle_t)context->m_handle; return 0; }
+	//{ return 0; }//igemmlt_turing_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_turing_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_turing_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_turing_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_turing_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_ampere_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_ampere_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_ampere_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_ampere_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_ampere_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_ampere_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+  #define MAKE_FUNC_CTRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
+	void ctransform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(Context *context, dtype *A, dtype *out, int dim1, int dim2) \
+	{ \
+		transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((cublasLtHandle_t) context->m_handle, A, out, dim1, dim2); \
+	} \
+
+	MAKE_FUNC_CTRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8)
+	MAKE_FUNC_CTRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8)
+	MAKE_FUNC_CTRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8)
+	MAKE_FUNC_CTRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32)
+	MAKE_FUNC_CTRANSFORM(8, row, col_turing, n, int8_t, ROW, COL_TURING, false, 8)
+	MAKE_FUNC_CTRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8)
+	MAKE_FUNC_CTRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8)
+	MAKE_FUNC_CTRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32)
+
+	void cdequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols)
+	{ dequant_mm_int32_fp16(A, rowStats, colStats, out, newRowStats, newcolStats, bias, numRows, numCols); }
+	void cget_col_row_stats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols)
+	{ getColRowStats(A, rowStats, colStats, nnz_count_row, nnz_threshold, rows, cols); }
+
+  void cdouble_rowcol_quant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int *nnz_row_ptr, float threshold, int rows, int cols)
+	{ doubleRowColQuant(A, rowStats, colStats, out_col_normed, out_row_normed, rowidx, colidx, val, nnz_row_ptr, threshold, rows, cols); }
+
+	void ctransform_row2col32(char * A, char *out, int rows, int cols)
+	{ transform_row2col32(A, out, rows, cols); }
+
+	void ctransform_row2col32T(char * A, char *out, int rows, int cols)
+	{ transform_row2col32T(A, out, rows, cols); }
+
+	void ctransform_row2turing(char * A, char *out, int rows, int cols)
+	{ transform_row2turing(A, out, rows, cols); }
+
+	void ctransform_row2turingT(char * A, char *out, int rows, int cols)
+	{ transform_row2turingT(A, out, rows, cols); }
+
+	void ctransform_row2ampere(char * A, char *out, int rows, int cols)
+	{ transform_row2ampere(A, out, rows, cols); }
+
+	void ctransform_row2ampereT(char * A, char *out, int rows, int cols)
+	{ transform_row2ampereT(A, out, rows, cols); }
+
+	void cspmm_coo(ContextCusparse *context, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B)
+  { spmm_coo((hipsparseHandle_t) context->m_handle, A_rowidx, A_colidx, A_vals, A_nnz, A_rows, A_cols, B_cols, ldb, B, ldc, C, transposed_B); }
+
+	void cspmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
+	{ spmm_coo_very_sparse_naive_fp16(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
+
+	void cspmm_coo_very_sparse_naive_int8(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
+	{ spmm_coo_very_sparse_naive_int8(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
+
+	void cextractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_turing(A, idx, out, idx_size, rows, cols); }
+	void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_ampere(A, idx, out, idx_size, rows, cols); }
+
+	//void cgemm_host_fp32(int M, int N, int K, float * A,  float* B,  float * out,  int lda, int ldb, int ldc)
+	//{ gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); }
+
+	void cgemm_host_fp16(int M, int N, int K, half * A,  half* B,  half * out,  int lda, int ldb, int ldc)
+	{ gemm_host_fp16(M, N, K, A, B, out, lda, ldb, ldc); }
+
+	void cgemm_4bit_inference(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize)
+	{ gemm_4bit_inference(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); }
+
+	void *cget_managed_ptr(size_t bytes)
+	{
+		void *ptr;
+		CUDA_CHECK_RETURN(hipMallocManaged(&ptr, bytes, hipMemAttachHost));
+		CUDA_CHECK_RETURN(hipPeekAtLastError());
+
+		return ptr;
+	}
+
+	void cprefetch(void *ptr, size_t bytes, int device)
+	{
+		CUDA_CHECK_RETURN(hipMemPrefetchAsync(ptr, bytes, device, 0));
+		CUDA_CHECK_RETURN(hipPeekAtLastError());
+	}
+
+  #define CMAKE_ELEMENTWISE_FUNC(fname, type_name, ctype, FUNC) \
+	void c##fname##_##type_name(ctype *A, ctype *B, ctype value, long n){ fname##_##type_name(A, B, value, n); } \
+
+	CMAKE_ELEMENTWISE_FUNC(fill, fp32, float, FILL)
+	CMAKE_ELEMENTWISE_FUNC(fill, uint8, unsigned char, FILL)
+	CMAKE_ELEMENTWISE_FUNC(arange, fp32, float, ARANGE)
+	CMAKE_ELEMENTWISE_FUNC(_mul, fp32, float, _MUL)
+
+#endif
+	void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
+	void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
+}

From 19289600f311ab7017c5fac3a1f1d1117f684bcd Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Sat, 8 Jul 2023 21:58:04 -0500
Subject: [PATCH 002/234] hipify pythoninterface

---
 csrc/pythonInterface.c | 113 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 2 deletions(-)

diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index 23a0364cc..56b322e72 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -6,6 +6,9 @@
 #if BUILD_CUDA
 #include <ops.cuh>
 #endif
+#if BUILD_HIP
+#include <ops.hiph>
+#endif
 #include <cpu_ops.h>
 
 // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
@@ -15,7 +18,7 @@
 //                               UNMANGLED CALLS
 //===================================================================================
 
-#if BUILD_CUDA
+#if defined(BUILD_CUDA) || defined(BUILD_HIP)
 void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles<float>(A, code, offset, n); }
 void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles<half>(A, code, offset, n); }
 
@@ -116,12 +119,23 @@ void dequantizeBlockwise_fp32_fp4(float *code, unsigned char *A, float *absmax,
 void dequantizeBlockwise_fp16_nf4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half, NF4>(NULL, A, absmax, out, blocksize, n); } \
 void dequantizeBlockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, NF4>(NULL, A, absmax, out, blocksize, n); }
 
+#ifndef NO_HIPBLASLT
 
+#if BUILD_CUDA
 #define MAKE_FUNC_TRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
 void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(cublasLtHandle_t ltHandle, dtype *A, dtype *out, int dim1, int dim2) \
 { \
 	transform<dtype, src, target, transpose, bits>(ltHandle, A, out, dim1, dim2); \
 } \
+#endif
+
+#if BUILD_HIP
+#define MAKE_FUNC_TRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
+void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(hipblasLtHandle_t ltHandle, dtype *A, dtype *out, int dim1, int dim2) \
+{ \
+	transform<dtype, src, target, transpose, bits>(ltHandle, A, out, dim1, dim2); \
+} \
+#endif
 
 MAKE_FUNC_TRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8);
 MAKE_FUNC_TRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8);
@@ -131,6 +145,7 @@ MAKE_FUNC_TRANSFORM(8, row, col_turing, n, int8_t, ROW, COL_TURING, false, 8);
 MAKE_FUNC_TRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8);
 MAKE_FUNC_TRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8);
 MAKE_FUNC_TRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32);
+#endif
 
 void transform_row2col32(char * A, char *out, int rows, int cols){ transformRowToFormat<COL32, 0>(A, out, rows, cols); }
 void transform_row2col32T(char * A, char *out, int rows, int cols){ transformRowToFormat<COL32, 1>(A, out, rows, cols); }
@@ -142,6 +157,9 @@ void transform_row2ampereT(char * A, char *out, int rows, int cols){ transformRo
 void extractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_TURING>(A, idx, out, idx_size, rows, cols); }
 void extractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_AMPERE>(A, idx, out, idx_size, rows, cols); }
 
+#ifndef NO_HIPBLASLT
+
+#if BUILD_CUDA
  int igemmlt_turing_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
 	{ return igemmlt<COL_TURING, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
 
@@ -159,6 +177,29 @@ void extractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int row
 
  int igemmlt_ampere_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
 	{ return igemmlt<COL_AMPERE, 8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+#endif
+
+#if BUILD_HIP
+ int igemmlt_turing_32(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_TURING, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_turing_8(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_TURING, 8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_turing_8_rowscale(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_TURING, 8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_ampere_32(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_AMPERE, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_ampere_8(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_AMPERE, 8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+ int igemmlt_ampere_8_rowscale(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt<COL_AMPERE, 8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+#endif
+
+#endif
 
 void spmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
 { spmm_coo_very_sparse_naive<half, 16>(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
@@ -169,7 +210,7 @@ void spmm_coo_very_sparse_naive_int8(int *max_count, int *max_idx, int *offset_r
 
 extern "C"
 {
-#if BUILD_CUDA
+#if defined(BUILD_CUDA) || defined(BUILD_HIP)
 	void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
 	void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
 	void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
@@ -261,8 +302,18 @@ extern "C"
 	{ strided_gemmex(context, transposeA, transposeB, m, n, k, A, B, C, lda, ldb, ldc, strideA, strideB, strideC, batchCount); }
 
 	Context *get_context(){ return new Context(); }
+#if BUILD_CUDA
 	ContextCusparse *get_cusparse(){ return new ContextCusparse(); }
+#endif
+
+#if BUILD_HIP
+	ContextHipsparse *get_hipsparse(){ return new ContextHipsparse(); }
+#endif
+
 
+#ifndef NO_HIPBLASLT
+
+#if BUILD_CUDA
 	int cigemmlt_turing_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
 	{ return igemmlt_turing_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
 	//{ (cublasLtHandle_t)context->m_handle; return 0; }
@@ -288,6 +339,38 @@ extern "C"
 	{ \
 		transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((cublasLtHandle_t) context->m_handle, A, out, dim1, dim2); \
 	} \
+#endif
+
+#if BUILD_CUDA
+	int cigemmlt_turing_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_turing_32((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+	//{ (hipblasLtHandle_t)context->m_handle; return 0; }
+	//{ return 0; }//igemmlt_turing_32((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_turing_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_turing_8((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_turing_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_turing_8_rowscale((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_ampere_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_ampere_32((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_ampere_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_ampere_8_rowscale((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+	int cigemmlt_ampere_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+	{ return igemmlt_ampere_8((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
+
+  #define MAKE_FUNC_CTRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
+	void ctransform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(Context *context, dtype *A, dtype *out, int dim1, int dim2) \
+	{ \
+		transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((hipblasLtHandle_t) context->m_handle, A, out, dim1, dim2); \
+	} \
+#endif
+
+#endif
+
 
 	MAKE_FUNC_CTRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8)
@@ -324,8 +407,15 @@ extern "C"
 	void ctransform_row2ampereT(char * A, char *out, int rows, int cols)
 	{ transform_row2ampereT(A, out, rows, cols); }
 
+#if BUILD_CUDA
 	void cspmm_coo(ContextCusparse *context, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B)
   { spmm_coo((cusparseHandle_t) context->m_handle, A_rowidx, A_colidx, A_vals, A_nnz, A_rows, A_cols, B_cols, ldb, B, ldc, C, transposed_B); }
+#endif
+
+#if BUILD_HIP
+	void cspmm_coo(ContextHipsparse *context, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B)
+  { spmm_coo((hipsparseHandle_t) context->m_handle, A_rowidx, A_colidx, A_vals, A_nnz, A_rows, A_cols, B_cols, ldb, B, ldc, C, transposed_B); }
+#endif
 
 	void cspmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
 	{ spmm_coo_very_sparse_naive_fp16(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
@@ -345,6 +435,7 @@ extern "C"
 	void cgemm_4bit_inference(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize)
 	{ gemm_4bit_inference(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); }
 
+#if BUILD_CUDA
 	void *cget_managed_ptr(size_t bytes)
 	{
 		void *ptr;
@@ -359,6 +450,24 @@ extern "C"
 		CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
 		CUDA_CHECK_RETURN(cudaPeekAtLastError());
 	}
+#endif
+
+#if BUILD_HIP
+	void *cget_managed_ptr(size_t bytes)
+	{
+		void *ptr;
+		CUDA_CHECK_RETURN(hipMallocManaged(&ptr, bytes, hipMemAttachHost));
+		CUDA_CHECK_RETURN(hipPeekAtLastError());
+
+		return ptr;
+	}
+
+	void cprefetch(void *ptr, size_t bytes, int device)
+	{
+		CUDA_CHECK_RETURN(hipMemPrefetchAsync(ptr, bytes, device, 0));
+		CUDA_CHECK_RETURN(hipPeekAtLastError());
+	}
+#endif
 
   #define CMAKE_ELEMENTWISE_FUNC(fname, type_name, ctype, FUNC) \
 	void c##fname##_##type_name(ctype *A, ctype *B, ctype value, long n){ fname##_##type_name(A, B, value, n); } \

From 8ca0b5ca77ea3ba44e3f3e0b378d5e2cb6caa11c Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Sat, 8 Jul 2023 21:59:55 -0500
Subject: [PATCH 003/234] copy from agrocylo

---
 include/Algo-Direct2.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h
index d5fa58d12..3d2de1b35 100644
--- a/include/Algo-Direct2.h
+++ b/include/Algo-Direct2.h
@@ -93,8 +93,8 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
         __m128 vxp = _mm_shuffle_ps(xp01, xp23, (1) + (3 << 2) + (1 << 4) + (3 << 6));
 #endif
         IVec<SSE, float> i(u.vec);
-        IVec<SSE, float> vlem = vz < vxm;
-        IVec<SSE, float> vlep = vz < vxp;
+        IVec<SSE, float> vlem = operator< (vz, vxm);
+        IVec<SSE, float> vlep = operator< (vz, vxp);
         i = i + vlem + vlep;
         i.store(pr);
     }
@@ -123,8 +123,8 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
         __m128d vxp = _mm_shuffle_pd(vx0, vx1, 3);
 
         IVec<SSE, double> i(b1, b0);
-        IVec<SSE, double> vlem = (vz < vxm);
-        IVec<SSE, double> vlep = (vz < vxp);
+        IVec<SSE, double> vlem = operator< (vz, vxm);
+        IVec<SSE, double> vlep = operator< (vz, vxp);
         i = i + vlem + vlep;
 
         union {

From 8acbcf24f250376b0e583a717b8eea4df2286dda Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Sat, 8 Jul 2023 23:36:10 -0500
Subject: [PATCH 004/234] hipify cuparse and cublas calls

---
 bitsandbytes/autograd/_functions.py | 4 ++++
 bitsandbytes/cextension.py          | 8 +++++++-
 bitsandbytes/cuda_setup/main.py     | 2 ++
 bitsandbytes/functional.py          | 6 +++++-
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index c2298c8ed..bd3e8fab2 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -223,6 +223,10 @@ def backward(ctx, grad_output):
 
 def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""
+    """Important: Could I use igemmlt on ROCm? """
+    if torch.version.hip:
+        #Well, lets currently disable it
+        return False
     if torch.cuda.get_device_capability(device=device) < (7, 5):
         return False
     device_name = torch.cuda.get_device_name(device=device)
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 131edc5ee..365cfb579 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -25,9 +25,15 @@
         Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
         to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
         and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues''')
+
     lib.cadam32bit_grad_fp32 # runs on an error if the library could not be found -> COMPILED_WITH_CUDA=False
     lib.get_context.restype = ct.c_void_p
-    lib.get_cusparse.restype = ct.c_void_p
+    
+    if torch.version.cuda:
+        lib.get_cusparse.restype = ct.c_void_p
+    elif torch.version.hip:
+        lib.get_hipsparse.restype = ct.c_void_p
+
     lib.cget_managed_ptr.restype = ct.c_void_p
     COMPILED_WITH_CUDA = True
 except AttributeError as ex:
diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index e7901d82e..295444f59 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -381,6 +381,8 @@ def evaluate_cuda_setup():
               ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
         print('='*80)
     if not torch.cuda.is_available(): return 'libbitsandbytes_cpu.so', None, None, None, None
+    if torch.version.hip: return 'libbitsandbytes_hip_nohipblaslt.so', None, None, None, None
+
 
     cuda_setup = CUDASetup.get_instance()
     cudart_path = determine_cuda_runtime_lib_path()
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index afa346e6e..0b9e1203f 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -150,7 +150,11 @@ def __init__(self):
         raise RuntimeError("Call get_instance() instead")
 
     def initialize(self):
-        self.context = ct.c_void_p(lib.get_cusparse())
+        #self.context = ct.c_void_p(lib.get_cusparse())
+        if torch.version.cuda:
+            self.context = ct.c_void_p(lib.get_cusparse())
+        elif torch.version.hip:
+            self.context = ct.c_void_p(lib.get_hipsparse())
 
     @classmethod
     def get_instance(cls):

From e80a60cda54dee46c46fa027c7817396e02f5072 Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Mon, 10 Jul 2023 02:17:41 -0500
Subject: [PATCH 005/234] fix compile error and Makefile

---
 Makefile                                      |  21 +-
 bitsandbytes/cextension.py                    |   2 +-
 csrc/kernels.hip                              | 185 ++++++-----
 csrc/{kernels_hip.cuh => kernels.hiph}        |   2 +-
 csrc/ops.hip                                  | 302 +++++++++---------
 csrc/{ops_hip.cuh => ops.hiph}                |  48 +--
 csrc/pythonInterface.c                        |  29 +-
 ...honInterface_hip.c => test_delete_later.c} |   0
 include/Algo-Direct2.h                        |   8 +-
 9 files changed, 307 insertions(+), 290 deletions(-)
 rename csrc/{kernels_hip.cuh => kernels.hiph} (99%)
 rename csrc/{ops_hip.cuh => ops.hiph} (85%)
 rename csrc/{pythonInterface_hip.c => test_delete_later.c} (100%)

diff --git a/Makefile b/Makefile
index 19b5b91d4..75c27acfb 100644
--- a/Makefile
+++ b/Makefile
@@ -7,14 +7,17 @@ ifeq ($(CUDA_HOME),)
 	CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev)
 endif
 
-ifndef CUDA_VERSION
-$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU)
-CUDA_VERSION:=
-endif
+ROCM_HOME := /opt/rocm
+
+#ifndef CUDA_VERSION
+#$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU)
+#CUDA_VERSION:=
+#endif
 
 
 
 NVCC := $(CUDA_HOME)/bin/nvcc
+HIPCC := $(ROCM_HOME)/bin/hipcc
 
 ###########################################
 
@@ -27,6 +30,9 @@ FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c
 INCLUDE :=  -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
 LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib
 
+INCLUDE_ROCM := -I $(ROCM_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
+LIB_ROCM := -L $(ROCM_HOME)/lib -lhipblas -lhiprand -lhipsparse -L $(CONDA_PREFIX)/lib
+
 # NVIDIA NVCC compilation flags
 COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
 COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
@@ -100,6 +106,11 @@ cuda12x: $(BUILD_DIR) env
 	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
 
+hip: $(BUILD_DIR) env
+	$(HIPCC) -std=c++14 -fPIC -c -DNO_HIPBLASLT $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/ops.hip -o $(BUILD_DIR)/ops.o
+	$(HIPCC) -std=c++14 -fPIC -c -DNO_HIPBLASLT $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/kernels.hip -o $(BUILD_DIR)/kernels.o
+	$(GPP) -std=c++14 -D__HIP_PLATFORM_AMD__ -DBUILD_HIP -DNO_HIPBLASLT -shared -fPIC $(INCLUDE_ROCM) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_hip_nohipblaslt.so $(LIB_ROCM)
+
 cpuonly: $(BUILD_DIR) env
 	$(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so
 
@@ -109,8 +120,10 @@ env:
 	@echo "CUDA_VERSION: $(CUDA_VERSION)"
 	@echo "============================"
 	@echo "NVCC path: $(NVCC)"
+	@echo "HIPCC path: $(HIPCC)"
 	@echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`"
 	@echo "CUDA_HOME: $(CUDA_HOME)"
+	@echo "HIP_HOME: $(HIP_HOME)"
 	@echo "CONDA_PREFIX: $(CONDA_PREFIX)"
 	@echo "PATH: $(PATH)"
 	@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 365cfb579..35c0386b9 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -14,7 +14,7 @@
 
 lib = setup.lib
 try:
-    if lib is None and torch.cuda.is_available():
+    if lib is None and torch.cuda.is_available() :
         CUDASetup.get_instance().generate_instructions()
         CUDASetup.get_instance().print_log_stack()
         raise RuntimeError('''
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 4724f57c0..2bd72504a 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -5,18 +5,14 @@
 // This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <kernels.cuh>
-#include <cub/block/block_radix_sort.cuh>
-#include <cub/warp/warp_reduce.cuh>
+#include <kernels.hiph>
+#include <hip/hip_runtime.h>
 #include <hipcub/hipcub.hpp>
-#include <cub/block/block_discontinuity.cuh>
-#include <cub/block/block_store.cuh>
-#include <hipcub/hipcub.hpp>
-#include <hipcub/hipcub.hpp>
-#include <math_constants.h>
+#include <hip/hip_math_constants.h>
+
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
-#include <mma.h>
+//#include <mma.h>
 
 
 #define HLF_MAX 65504
@@ -26,29 +22,7 @@
 
 
 // source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
-__device__ float atomicMax(float* address, float val) {
-  int* address_as_i = reinterpret_cast<int*>(address);
-  int old = *address_as_i, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(
-        reinterpret_cast<int*>(address), assumed,
-        __float_as_int(fmaxf(val, __int_as_float(assumed))));
-  } while (assumed != old);
-  return __int_as_float(old);
-}
-
-__device__ float atomicMin(float* address, float val) {
-  int* address_as_i = reinterpret_cast<int*>(address);
-  int old = *address_as_i, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(
-        reinterpret_cast<int*>(address), assumed,
-        __float_as_int(fminf(val, __int_as_float(assumed))));
-  } while (assumed != old);
-  return __int_as_float(old);
-}
+// Luckily we have atomicmax and atomicmin in ROCm 
 
 __device__ float dDequantizeFP4(unsigned char val, float absmax)
 {
@@ -527,7 +501,7 @@ __global__ void kCompressMax(T * __restrict__ const A, T* out, unsigned char* ou
 {
   typedef hipcub::WarpReduce<T> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp_storage;
-  typedef cub::BlockLoad<T, BLOCK_SIZE/8 , 8, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+  typedef hipcub::BlockLoad<T, BLOCK_SIZE/8 , 8, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
   __shared__ typename LoadT::TempStorage loadt;
 
   const int warp_idx = threadIdx.x/32;
@@ -576,7 +550,7 @@ __global__ void kCompressMax(T * __restrict__ const A, T* out, unsigned char* ou
   {
     // 3. do warp reduction + broadcast back
     warp_max = WarpReduce(temp_storage).Reduce(max1, hipcub::Max());
-    warp_max = cub::ShuffleIndex<32>(warp_max, 0, 0xffffffff);
+    warp_max = hipcub::ShuffleIndex<32>(warp_max, 0, 0xffffffff);
 
     // 4. Up-shift maxed value, write index into shared memory, replace with 2nd largest
     if(warp_max == max1)
@@ -590,7 +564,7 @@ __global__ void kCompressMax(T * __restrict__ const A, T* out, unsigned char* ou
 
       max2 = -64000.0f;
     }
-    __syncwarp();
+    //__syncwarp();
   }
 
   if(threadIdx.x % 32 < 8)
@@ -617,8 +591,8 @@ __global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const f
 
   T vals[NUM_ESTIMATE];
 
-  typedef cub::BlockRadixSort<T, THREADS_ESTIMATE, NUM_ESTIMATE, cub::NullType, 4, true, cub::BLOCK_SCAN_RAKING> BlockRadixSort;
-  typedef cub::BlockLoad<T, THREADS_ESTIMATE, NUM_ESTIMATE, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef hipcub::BlockRadixSort<T, THREADS_ESTIMATE, NUM_ESTIMATE, hipcub::NullType, 4, true, hipcub::BLOCK_SCAN_RAKING> BlockRadixSort;
+  typedef hipcub::BlockLoad<T, THREADS_ESTIMATE, NUM_ESTIMATE, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
 
   __shared__ union {
       typename LoadFloat::TempStorage loadf;
@@ -684,8 +658,8 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c
   unsigned char qvals[NUM];
   //const int lane_id = threadIdx.x % 2;
 
-  typedef cub::BlockLoad<float, TH, NUM, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
-  typedef cub::BlockStore<unsigned char, TH, NUM, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+  typedef hipcub::BlockLoad<float, TH, NUM, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef hipcub::BlockStore<unsigned char, TH, NUM, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
 
   __shared__ typename LoadFloat::TempStorage loadf;
   __shared__ typename StoreChar::TempStorage storec;
@@ -735,10 +709,10 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
   float local_abs_max = 0.0f;
   int local_rand_idx = 0;
 
-  typedef cub::BlockLoad<T, BLOCK_SIZE/NUM_PER_TH, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
-  typedef cub::BlockStore<unsigned char, BLOCK_SIZE/NUM_PER_TH, (DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+  typedef hipcub::BlockLoad<T, BLOCK_SIZE/NUM_PER_TH, NUM_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+  typedef hipcub::BlockStore<unsigned char, BLOCK_SIZE/NUM_PER_TH, (DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
   typedef hipcub::BlockReduce<float, BLOCK_SIZE/NUM_PER_TH> BlockReduce;
-  typedef cub::BlockLoad<float, BLOCK_SIZE/NUM_PER_TH, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef hipcub::BlockLoad<float, BLOCK_SIZE/NUM_PER_TH, NUM_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
 
   __shared__ typename LoadT::TempStorage loadt;
   __shared__ typename LoadFloat::TempStorage loadf;
@@ -779,7 +753,7 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
     else
       local_abs_max = smem_absmax_value[0];
 
-    __syncwarp();
+    //__syncwarp();
 
     local_abs_max = 1.0f/local_abs_max;
 
@@ -840,8 +814,8 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs
   unsigned char qvals[NUM_PER_TH];
   float local_abs_max = -FLT_MAX;
 
-  typedef cub::BlockLoad<unsigned char, THREADS, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
-  typedef cub::BlockStore<T, THREADS, NUM_PER_TH*((DATA_TYPE > 0) ? 2 : 1), cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+  typedef hipcub::BlockLoad<unsigned char, THREADS, NUM_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+  typedef hipcub::BlockStore<T, THREADS, NUM_PER_TH*((DATA_TYPE > 0) ? 2 : 1), hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
 
   __shared__ typename LoadChar::TempStorage loadchar;
   __shared__ typename StoreT::TempStorage storet;
@@ -935,8 +909,8 @@ __global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
   const float correction1 = 1.0f/(1.0f - powf(beta1, step));
   const float correction2 = 1.0f/(1.0f - powf(beta2, step));
 
-  typedef cub::BlockLoad<T, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
-  typedef cub::BlockLoad<float, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef hipcub::BlockLoad<T, BLOCK_SIZE/NUM_VALS, NUM_VALS, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
+  typedef hipcub::BlockLoad<float, BLOCK_SIZE/NUM_VALS, NUM_VALS, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
   typedef hipcub::BlockReduce<float, BLOCK_SIZE/NUM_VALS> BlockReduce;
 
   __shared__ union {
@@ -986,7 +960,7 @@ __global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
       if(threadIdx.x == 0)
         atomicAdd(&unorm[0], s1_vals[0]);
 
-      __syncwarp();
+      //__syncwarp();
   }
 }
 
@@ -1024,11 +998,11 @@ __global__ void kOptimizer32bit2State(T* g, T* p,
   }
   else{ update_scale = 1.0f; }
 
-  typedef cub::BlockLoad<T, TH, NUM_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
-  typedef cub::BlockStore<T, TH, NUM_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE> Store;
+  typedef hipcub::BlockLoad<T, TH, NUM_PER_THREAD, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
+  typedef hipcub::BlockStore<T, TH, NUM_PER_THREAD, hipcub::BLOCK_STORE_WARP_TRANSPOSE> Store;
 
-  typedef cub::BlockLoad<float, TH, NUM_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
-  typedef cub::BlockStore<float, TH, NUM_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreFloat;
+  typedef hipcub::BlockLoad<float, TH, NUM_PER_THREAD, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef hipcub::BlockStore<float, TH, NUM_PER_THREAD, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreFloat;
 
   __shared__ union {
       typename Load::TempStorage load;
@@ -1098,8 +1072,8 @@ __global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
 
   float s1_vals[NUM_VALS];
 
-  typedef cub::BlockLoad<T, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
-  typedef cub::BlockLoad<float, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef hipcub::BlockLoad<T, BLOCK_SIZE/NUM_VALS, NUM_VALS, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
+  typedef hipcub::BlockLoad<float, BLOCK_SIZE/NUM_VALS, NUM_VALS, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
   typedef hipcub::BlockReduce<float, BLOCK_SIZE/NUM_VALS> BlockReduce;
 
   __shared__ union {
@@ -1159,7 +1133,7 @@ __global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
       if(threadIdx.x == 0)
         atomicAdd(&unorm[0], s1_vals[0]);
 
-      __syncwarp();
+      //__syncwarp();
   }
 }
 
@@ -1189,11 +1163,11 @@ __global__ void kOptimizer32bit1State(T *g, T *p,
 
   float s1_vals[NUM_PER_THREAD];
 
-  typedef cub::BlockLoad<T, TH, NUM_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
-  typedef cub::BlockStore<T, TH, NUM_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE> Store;
+  typedef hipcub::BlockLoad<T, TH, NUM_PER_THREAD, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> Load;
+  typedef hipcub::BlockStore<T, TH, NUM_PER_THREAD, hipcub::BLOCK_STORE_WARP_TRANSPOSE> Store;
 
-  typedef cub::BlockLoad<float, TH, NUM_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
-  typedef cub::BlockStore<float, TH, NUM_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreFloat;
+  typedef hipcub::BlockLoad<float, TH, NUM_PER_THREAD, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
+  typedef hipcub::BlockStore<float, TH, NUM_PER_THREAD, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreFloat;
 
   __shared__ union {
       typename Load::TempStorage load;
@@ -1289,8 +1263,8 @@ kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned c
     unsigned char m_c1[NUM8BIT];
     unsigned char r_c2[NUM8BIT];
 
-    typedef cub::BlockLoad<T, NUM_THREADS, NUM8BIT, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
-    typedef cub::BlockLoad<unsigned char, NUM_THREADS, NUM8BIT, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadUInt8;
+    typedef hipcub::BlockLoad<T, NUM_THREADS, NUM8BIT, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef hipcub::BlockLoad<unsigned char, NUM_THREADS, NUM8BIT, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadUInt8;
     typedef hipcub::BlockReduce<float, NUM_THREADS> BlockReduce;
 
 
@@ -1418,11 +1392,11 @@ kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned cha
     unsigned char c2s[NUM_PER_THREAD2];
     T p_vals[NUM_PER_THREAD2];
     T g_vals[NUM_PER_THREAD2];
-    typedef cub::BlockLoad<T, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
-    typedef cub::BlockLoad<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+    typedef hipcub::BlockLoad<T, NUM_THREADS2, NUM_PER_THREAD2, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef hipcub::BlockLoad<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
 
-    typedef cub::BlockStore<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
-    typedef cub::BlockStore<T, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+    typedef hipcub::BlockStore<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+    typedef hipcub::BlockStore<T, NUM_THREADS2, NUM_PER_THREAD2, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
 
     __shared__ float smem_quantiles1[256];
     __shared__ float smem_quantiles2[256];
@@ -1526,8 +1500,8 @@ kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned c
     T g_vals[NUM8BIT];
     unsigned char m_c1[NUM8BIT];
 
-    typedef cub::BlockLoad<T, NUM_THREADS, NUM8BIT, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
-    typedef cub::BlockLoad<unsigned char, NUM_THREADS, NUM8BIT, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadUInt8;
+    typedef hipcub::BlockLoad<T, NUM_THREADS, NUM8BIT, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef hipcub::BlockLoad<unsigned char, NUM_THREADS, NUM8BIT, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadUInt8;
     typedef hipcub::BlockReduce<float, NUM_THREADS> BlockReduce;
 
 
@@ -1625,11 +1599,11 @@ kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
     unsigned char c1s[NUM_PER_THREAD2];
     T p_vals[NUM_PER_THREAD2];
     T g_vals[NUM_PER_THREAD2];
-    typedef cub::BlockLoad<T, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
-    typedef cub::BlockLoad<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+    typedef hipcub::BlockLoad<T, NUM_THREADS2, NUM_PER_THREAD2, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef hipcub::BlockLoad<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
 
-    typedef cub::BlockStore<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
-    typedef cub::BlockStore<T, NUM_THREADS2, NUM_PER_THREAD2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+    typedef hipcub::BlockStore<unsigned char, NUM_THREADS2, NUM_PER_THREAD2, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+    typedef hipcub::BlockStore<T, NUM_THREADS2, NUM_PER_THREAD2, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
 
     __shared__ float smem_quantiles1[256];
 
@@ -1723,7 +1697,7 @@ __global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int st
   int valid_items = 0;
 
   typedef hipcub::BlockReduce<float, BLOCK_SIZE/NUM_VALS> BlockReduce;
-  typedef cub::BlockLoad<T, BLOCK_SIZE/NUM_VALS, NUM_VALS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+  typedef hipcub::BlockLoad<T, BLOCK_SIZE/NUM_VALS, NUM_VALS, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
 
   __shared__ typename BlockReduce::TempStorage reduce;
 
@@ -1796,11 +1770,11 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
     unsigned char c2s[N_PER_TH];
     T g_vals[N_PER_TH];
     T p_vals[N_PER_TH];
-    typedef cub::BlockLoad<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
-    typedef cub::BlockLoad<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+    typedef hipcub::BlockLoad<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef hipcub::BlockLoad<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
 
-    typedef cub::BlockStore<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
-    typedef cub::BlockStore<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+    typedef hipcub::BlockStore<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+    typedef hipcub::BlockStore<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
 
     __shared__ float smem_quantiles1[LANES][257];
     __shared__ float smem_quantiles2[LANES][257];
@@ -1978,11 +1952,11 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char
     T g_vals[N_PER_TH];
 		T p_vals[N_PER_TH];
 
-    typedef cub::BlockLoad<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
-    typedef cub::BlockLoad<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+    typedef hipcub::BlockLoad<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
+    typedef hipcub::BlockLoad<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
 
-    typedef cub::BlockStore<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
-    typedef cub::BlockStore<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+    typedef hipcub::BlockStore<unsigned char, BLOCK_SIZE/N_PER_TH, N_PER_TH, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
+    typedef hipcub::BlockStore<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
 
     __shared__ float smem_quantiles1[LANES][257];
     typedef hipcub::BlockReduce<float, BLOCK_SIZE/N_PER_TH> BlockReduce1;
@@ -2153,10 +2127,10 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
   const int base_idx = (base_row*cols) + base_col;
   const int items_per_load = ITEMS_PER_THREAD*THREADS;
 
-  typedef cub::BlockLoad<T, THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_VECTORIZE> LoadT;
+  typedef hipcub::BlockLoad<T, THREADS, ITEMS_PER_THREAD, hipcub::BLOCK_LOAD_VECTORIZE> LoadT;
   typedef hipcub::BlockReduce<float, THREADS> BlockRowReduce;
   typedef hipcub::BlockReduce<int, THREADS> BlockRowSum;
-  typedef cub::BlockExchange<float, THREADS, ITEMS_PER_THREAD> BlockExchange;
+  typedef hipcub::BlockExchange<float, THREADS, ITEMS_PER_THREAD> BlockExchange;
 
   __shared__ union {
     typename BlockExchange::TempStorage exchange;
@@ -2342,8 +2316,8 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
   float local_rowStats[ITEMS_PER_THREAD];
   __shared__ float smem_rowStats[SUBTILE_ROWS];
 
-  typedef cub::BlockLoad<int, THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_DIRECT> LoadInt32;
-  typedef cub::BlockExchange<int, THREADS, ITEMS_PER_THREAD> ExchangeInt32;
+  typedef hipcub::BlockLoad<int, THREADS, ITEMS_PER_THREAD, hipcub::BLOCK_LOAD_DIRECT> LoadInt32;
+  typedef hipcub::BlockExchange<int, THREADS, ITEMS_PER_THREAD> ExchangeInt32;
   __shared__ typename LoadInt32::TempStorage loadint32;
   __shared__ typename ExchangeInt32::TempStorage exchangeint32;
 
@@ -2430,9 +2404,9 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int S
   const int base_idx = (base_row*cols) + base_col;
   const int items_per_load = ITEMS_PER_THREAD*THREADS;
 
-  typedef cub::BlockLoad<half, THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_VECTORIZE> LoadHalf;
+  typedef hipcub::BlockLoad<half, THREADS, ITEMS_PER_THREAD, hipcub::BLOCK_LOAD_VECTORIZE> LoadHalf;
   __shared__ typename LoadHalf::TempStorage loadhalf;
-  typedef cub::BlockStore<char, THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_VECTORIZE> StoreInt8;
+  typedef hipcub::BlockStore<char, THREADS, ITEMS_PER_THREAD, hipcub::BLOCK_STORE_VECTORIZE> StoreInt8;
   __shared__ typename StoreInt8::TempStorage storeint8;
 
   __shared__ float smem_row_stats[TILE_ROWS];
@@ -2565,7 +2539,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
   // so that we can have contiguous stores
   __shared__ char smem_data[32*33*ITEMS_PER_THREAD];
   char local_data[ITEMS_PER_THREAD];
-  typedef cub::BlockExchange<char, THREADS, ITEMS_PER_THREAD> BlockExchange;
+  typedef hipcub::BlockExchange<char, THREADS, ITEMS_PER_THREAD> BlockExchange;
 
   // we load row after row from the base_position
   // Load data row by row
@@ -3059,7 +3033,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 //// 2. Load k x k into registers
 //// 3. dequantize and store in second pair of k x k
 //// 4. matmul
-//// 5. sum with cub
+//// 5. sum with hipcub
 //// 6. store outputs
 //// TC kernel
 //// use k warps per thread block
@@ -3475,9 +3449,9 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
 //// 2. Dequantize B
 //// 3. Fetch data from A and multiply
 //
-//  typedef cub::BlockLoad<T, THREADS , ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadA;
+//  typedef hipcub::BlockLoad<T, THREADS , ITEMS, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadA;
 //  //__shared__ typename LoadA::TempStorage loada;
-//  typedef cub::BlockLoad<T, THREADS , ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadB;
+//  typedef hipcub::BlockLoad<T, THREADS , ITEMS, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadB;
 //  //__shared__ typename LoadB::TempStorage loadb;
 //  typedef hipcub::BlockReduce<T, THREADS> BlockReduce;
 //  // Allocate shared memory for BlockReduce
@@ -3672,7 +3646,7 @@ MAKE_PreconditionOptimizer32bit1State(RMSPROP, half)
 MAKE_PreconditionOptimizer32bit1State(RMSPROP, float)
 MAKE_PreconditionOptimizer32bit1State(LION, half)
 MAKE_PreconditionOptimizer32bit1State(LION, float)
-MAKE_PreconditionOptimizer32bit1State(LION, __nv_bfloat16)
+//MAKE_PreconditionOptimizer32bit1State(LION, hip_bfloat16)
 MAKE_PreconditionOptimizer32bit1State(ADAGRAD, half)
 MAKE_PreconditionOptimizer32bit1State(ADAGRAD, float)
 
@@ -3686,7 +3660,7 @@ MAKE_Optimizer32bit1State(RMSPROP, half)
 MAKE_Optimizer32bit1State(RMSPROP, float)
 MAKE_Optimizer32bit1State(LION, half)
 MAKE_Optimizer32bit1State(LION, float)
-MAKE_Optimizer32bit1State(LION, __nv_bfloat16)
+//MAKE_Optimizer32bit1State(LION, hip_bfloat16)
 MAKE_Optimizer32bit1State(ADAGRAD, half)
 MAKE_Optimizer32bit1State(ADAGRAD, float)
 
@@ -3698,14 +3672,16 @@ template __global__ void kPreconditionOptimizer32bit2State<gtype, oname, 4096, 8
 
 MAKE_PreconditionOptimizer32bit2State(ADAM, float)
 MAKE_PreconditionOptimizer32bit2State(ADAM, half)
-MAKE_PreconditionOptimizer32bit2State(ADAM, __nv_bfloat16)
+//MAKE_PreconditionOptimizer32bit2State(ADAM, hip_bfloat16)
 
 template __global__ void kOptimizer32bit2State<float, ADAM>(float* g, float* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
     const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
 template __global__ void kOptimizer32bit2State<half, ADAM>(half* g, half* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
     const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
-template __global__ void kOptimizer32bit2State<__nv_bfloat16, ADAM>(__nv_bfloat16* g, __nv_bfloat16* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
+/*
+template __global__ void kOptimizer32bit2State<hip_bfloat16, ADAM>(hip_bfloat16* g, hip_bfloat16* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
     const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+*/
 
 #define MAKE_PreconditionStatic8bit1State(oname, gtype) \
 template __global__ void kPreconditionOptimizerStatic8bit1State<gtype, oname>(gtype* p, gtype* __restrict__ const g, unsigned char*__restrict__  const state1,  \
@@ -3782,46 +3758,63 @@ MAKE_kQuantizeBlockwise(half,  4096, 4, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,  4096, 4, 1, General8bit)
 MAKE_kQuantizeBlockwise(half,  2048, 4, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,  1024, 4, 0, General8bit)
+/*
 MAKE_kQuantizeBlockwise(half,   512, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,   256, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,   128, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,    64, 2, 0, General8bit)
+*/
+
 MAKE_kQuantizeBlockwise(float, 4096, 4, 0, General8bit)
 MAKE_kQuantizeBlockwise(float, 4096, 4, 1, General8bit)
 MAKE_kQuantizeBlockwise(float, 2048, 4, 0, General8bit)
 MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit)
+/*
 MAKE_kQuantizeBlockwise(float,  512, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(float,  256, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(float,  128, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(float,   64, 2, 0, General8bit)
+*/
+
 MAKE_kQuantizeBlockwise(half,  4096, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(half,  2048, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(half,  1024, 4, 0, FP4)
+/*
 MAKE_kQuantizeBlockwise(half,   512, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(half,   256, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(half,   128, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(half,    64, 2, 0, FP4)
+*/
+
 MAKE_kQuantizeBlockwise(float, 4096, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(float, 2048, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(float, 1024, 4, 0, FP4)
+/*
 MAKE_kQuantizeBlockwise(float,  512, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(float,  256, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(float,  128, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(float,   64, 2, 0, FP4)
+*/
+
 MAKE_kQuantizeBlockwise(half,  4096, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(half,  2048, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(half,  1024, 4, 0, NF4)
+/*
 MAKE_kQuantizeBlockwise(half,   512, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(half,   256, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(half,   128, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(half,    64, 2, 0, NF4)
+*/
+
 MAKE_kQuantizeBlockwise(float, 4096, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(float, 2048, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(float, 1024, 4, 0, NF4)
+/*
 MAKE_kQuantizeBlockwise(float,  512, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(float,  256, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(float,  128, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(float,   64, 2, 0, NF4)
+*/
 
 template __global__ void kDequantizeBlockwise<half, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
 template __global__ void kDequantizeBlockwise<float, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
@@ -3841,7 +3834,7 @@ template __global__ void kOptimizerStatic8bit2StateBlockwise<gtype, oname, block
 
 MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, float, 2048, 8)
 MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, half, 2048, 8)
-MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, __nv_bfloat16, 2048, 8)
+//MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, hip_bfloat16, 2048, 8)
 
 
 #define MAKE_OptimizerStatic8bit1StateBlockwise(oname, gtype, block_size, num_per_thread) \
@@ -3860,6 +3853,6 @@ MAKE_OptimizerStatic8bit1StateBlockwise(RMSPROP, float, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(RMSPROP, half, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(LION, float, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(LION, half, 2048, 8)
-MAKE_OptimizerStatic8bit1StateBlockwise(LION, __nv_bfloat16, 2048, 8)
+//MAKE_OptimizerStatic8bit1StateBlockwise(LION, hip_bfloat16, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(ADAGRAD, float, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(ADAGRAD, half, 2048, 8)
diff --git a/csrc/kernels_hip.cuh b/csrc/kernels.hiph
similarity index 99%
rename from csrc/kernels_hip.cuh
rename to csrc/kernels.hiph
index 90a8cf6e9..1abae60d4 100644
--- a/csrc/kernels_hip.cuh
+++ b/csrc/kernels.hiph
@@ -6,7 +6,7 @@
 // LICENSE file in the root directory of this source tree.
 
 #include <float.h>
-#include <ops.cuh>
+#include <ops.hiph>
 
 #ifndef kernels
 #define kernels
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 3606fadc9..84019aabd 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -5,9 +5,14 @@
 // This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.
 
-#include <ops.cuh>
-#include <kernels.cuh>
+#include <ops.hiph>
+#include <kernels.hiph>
 #include <hipcub/hipcub.hpp>
+#include <hipblas/hipblas.h>
+#include <hipsparse/hipsparse.h>
+#ifndef NO_HIPBLASLT
+#include <hipblaslt/hipblaslt.h>
+#endif
 #include <limits>
 #include <BinSearch.h>
 #include <cassert>
@@ -247,19 +252,19 @@ void gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, in
   const int fbeta = 0;
   const void * alpha = &falpha;
   const void * beta = &fbeta;
-	rocblas_status status;
+	hipblasStatus_t status;
 
-			status = rocblas_gemmex(context->m_handle,
-					transposeA ? rocblas_operation_transpose : rocblas_operation_none,
-					transposeB ? rocblas_operation_transpose : rocblas_operation_none,
+			status = hipblasGemmEx(context->m_handle,
+					transposeA ? HIPBLAS_OP_T : HIPBLAS_OP_N,
+					transposeB ? HIPBLAS_OP_T : HIPBLAS_OP_N,
 					m, n,	k,
-					alpha, A, HIP_R_8I, lda, B, HIP_R_8I, ldb, beta,
-					C, HIP_R_32I, ldc,
-          HIP_R_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+					alpha, A, HIPBLAS_R_8I, lda, B, HIPBLAS_R_8I, ldb, beta,
+					C, HIPBLAS_R_32I, ldc,
+          HIPBLAS_R_32I, HIPBLAS_GEMM_DEFAULT);
 
-    if (status != rocblas_status_success)
+    if (status != HIPBLAS_STATUS_SUCCESS)
     {
-      std::cout << "CUBLAS ERROR: Status " << status << std::endl;
+      std::cout << "HIPBLAS ERROR: Status " << status << std::endl;
     }
 
 }
@@ -271,7 +276,7 @@ void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, i
   const int fbeta = 0;
   const void * alpha = &falpha;
   const void * beta = &fbeta;
-	rocblas_status status;
+	hipblasStatus_t status;
 
   //cout << transposeA << transposeB << endl;
   //printf("%i %i %i\n", m,n,k);
@@ -279,17 +284,17 @@ void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, i
   //printf("%i %i %i\n", strideA, strideB, strideC);
   //printf("%i\n", batchCount);
 
-			status = cublasGemmStridedBatchedEx(context->m_handle,
-					transposeA ? rocblas_operation_transpose : rocblas_operation_none,
-					transposeB ? rocblas_operation_transpose : rocblas_operation_none,
+			status = hipblasGemmStridedBatchedEx(context->m_handle,
+					transposeA ? HIPBLAS_OP_T : HIPBLAS_OP_N,
+					transposeB ? HIPBLAS_OP_T : HIPBLAS_OP_N,
 					m, n,	k,
-					alpha, A, HIP_R_8I, lda, (long long int)strideA, B, HIP_R_8I, ldb, (long long int)strideB, beta,
-					C, HIP_R_32I, ldc, (long long int)strideC, batchCount,
-          HIP_R_32I, CUBLAS_GEMM_DEFAULT);
+					alpha, A, HIPBLAS_R_8I, lda, (long long int)strideA, B, HIPBLAS_R_8I, ldb, (long long int)strideB, beta,
+					C, HIPBLAS_R_32I, ldc, (long long int)strideC, batchCount,
+          HIPBLAS_R_32I, HIPBLAS_GEMM_DEFAULT);
 
-    if (status != rocblas_status_success)
+    if (status != HIPBLAS_STATUS_SUCCESS)
     {
-      std::cout << "CUBLAS ERROR: Status " << status << std::endl;
+      std::cout << "HIPBLAS ERROR: Status " << status << std::endl;
     }
 
 }
@@ -299,39 +304,38 @@ int roundoff(int v, int d) {
 }
 
 
-#ifdef NO_CUBLASLT
-#else
-template<int ORDER> cublasLtOrder_t get_order()
+#ifndef NO_HIPBLASLT
+template<int ORDER> hipblasLtOrder_t get_order()
 {
 	switch(ORDER)
 	{
 		case ROW:
-      return CUBLASLT_ORDER_ROW;
+      return hipblasLt_ORDER_ROW;
 			break;
     case COL:
-      return CUBLASLT_ORDER_COL;
+      return hipblasLt_ORDER_COL;
       break;
     case COL32:
-      return CUBLASLT_ORDER_COL32;
+      return hipblasLt_ORDER_COL32;
       break;
     case COL_TURING:
-      return CUBLASLT_ORDER_COL4_4R2_8C;
+      return hipblasLt_ORDER_COL4_4R2_8C;
       break;
     case COL_AMPERE:
-      return CUBLASLT_ORDER_COL32_2R_4R4;
+      return hipblasLt_ORDER_COL32_2R_4R4;
       break;
 		default:
 			break;
   }
 
-	return CUBLASLT_ORDER_ROW;
+	return hipblasLt_ORDER_ROW;
 }
 
-template cublasLtOrder_t get_order<ROW>();
-template cublasLtOrder_t get_order<COL>();
-template cublasLtOrder_t get_order<COL32>();
-template cublasLtOrder_t get_order<COL_TURING>();
-template cublasLtOrder_t get_order<COL_AMPERE>();
+template hipblasLtOrder_t get_order<ROW>();
+template hipblasLtOrder_t get_order<COL>();
+template hipblasLtOrder_t get_order<COL32>();
+template hipblasLtOrder_t get_order<COL_TURING>();
+template hipblasLtOrder_t get_order<COL_AMPERE>();
 #endif
 
 
@@ -366,61 +370,61 @@ template int get_leading_dim<ROW>(int dim1, int dim2);
 template int get_leading_dim<COL>(int dim1, int dim2);
 template int get_leading_dim<COL32>(int dim1, int dim2);
 
-template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(cublasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2)
+#ifndef NO_HIPBLASLT
+template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(hipblasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2)
 {
-#ifdef NO_CUBLASLT
-#else
-  cublasLtOrder_t orderA = get_order<SRC>();
-  cublasLtOrder_t orderOut = get_order<TARGET>();
+  hipblasLtOrder_t orderA = get_order<SRC>();
+  hipblasLtOrder_t orderOut = get_order<TARGET>();
   int ldA = get_leading_dim<SRC>(dim1, dim2);
   int ldOut = get_leading_dim<TARGET>(dim1, dim2);
 
-  cublasLtMatrixLayout_t A_desc = NULL, out_desc = NULL;
-  cublasLtMatrixTransformDesc_t A2Out_desc = NULL;
+  hipblasLtMatrixLayout_t A_desc = NULL, out_desc = NULL;
+  hipblasLtMatrixTransformDesc_t A2Out_desc = NULL;
   rocblas_operation opTranspose = rocblas_operation_transpose;
   float transformAlpha = 1.0f, transformBeta = 0.0f;
 
 
   if(DTYPE == 8)
   {
-    checkCublasStatus(cublasLtMatrixLayoutCreate(&A_desc, HIP_R_8I, dim1, dim2, ldA));
-    checkCublasStatus(cublasLtMatrixLayoutCreate(&out_desc, HIP_R_8I, dim1, dim2, ldOut));
+    checkCublasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLAS_R_8I, dim1, dim2, ldA));
+    checkCublasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLAS_R_8I, dim1, dim2, ldOut));
   }
   else if(DTYPE == 32)
   {
-    checkCublasStatus(cublasLtMatrixLayoutCreate(&A_desc, HIP_R_32I, dim1, dim2, ldA));
-    checkCublasStatus(cublasLtMatrixLayoutCreate(&out_desc, HIP_R_32I, dim1, dim2, ldOut));
+    checkCublasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLAS_R_32I, dim1, dim2, ldA));
+    checkCublasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLAS_R_32I, dim1, dim2, ldOut));
   }
   else
   {
     printf("ERROR WRONG TYPE FOR TRANSFORM: %i\n", DTYPE);
   }
 
-  checkCublasStatus(cublasLtMatrixLayoutSetAttribute(A_desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &orderA, sizeof(orderA)));
-  checkCublasStatus(cublasLtMatrixLayoutSetAttribute(out_desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &orderOut, sizeof(orderOut)));
+  checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(A_desc, hipblasLt_MATRIX_LAYOUT_ORDER, &orderA, sizeof(orderA)));
+  checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(out_desc, hipblasLt_MATRIX_LAYOUT_ORDER, &orderOut, sizeof(orderOut)));
 
-  checkCublasStatus(cublasLtMatrixTransformDescCreate(&A2Out_desc, HIP_R_32F));
+  checkCublasStatus(hipblasLtMatrixTransformDescCreate(&A2Out_desc, HIPBLAS_R_32F));
 
-  if(transpose){ checkCublasStatus(cublasLtMatrixTransformDescSetAttribute(A2Out_desc, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); }
+  if(transpose){ checkCublasStatus(hipblasLtMatrixTransformDescSetAttribute(A2Out_desc, hipblasLt_MATRIX_TRANSFORM_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); }
 
-  checkCublasStatus(cublasLtMatrixTransform(ltHandle, A2Out_desc, &transformAlpha, A, A_desc, &transformBeta, NULL, NULL, out, out_desc, 0));
+  checkCublasStatus(hipblasLtMatrixTransform(ltHandle, A2Out_desc, &transformAlpha, A, A_desc, &transformBeta, NULL, NULL, out, out_desc, 0));
 
-  if (A_desc) checkCublasStatus(cublasLtMatrixLayoutDestroy(A_desc));
-  if (out_desc) checkCublasStatus(cublasLtMatrixLayoutDestroy(out_desc));
-  if (A2Out_desc) checkCublasStatus(cublasLtMatrixTransformDescDestroy(A2Out_desc));
-#endif
+  if (A_desc) checkCublasStatus(hipblasLtMatrixLayoutDestroy(A_desc));
+  if (out_desc) checkCublasStatus(hipblasLtMatrixLayoutDestroy(out_desc));
+  if (A2Out_desc) checkCublasStatus(hipblasLtMatrixTransformDescDestroy(A2Out_desc));
 }
 
-template void transform<int8_t, ROW, COL, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
-template void transform<int8_t, ROW, ROW, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
-template void transform<int8_t, ROW, COL32, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
-template void transform<int32_t, ROW, COL32, false, 32>(cublasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
-template void transform<int8_t, ROW, COL_TURING, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
-template void transform<int8_t, ROW, COL_AMPERE, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
-template void transform<int8_t, COL32, ROW, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
-template void transform<int32_t, COL32, ROW, false, 32>(cublasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, COL, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, ROW, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, COL32, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int32_t, ROW, COL32, false, 32>(hipblasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, COL_TURING, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, COL_AMPERE, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int8_t, COL32, ROW, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int32_t, COL32, ROW, false, 32>(hipblasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
+#endif
 
-template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+#ifndef NO_HIPBLASLT
+template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
 {
 #ifdef NO_CUBLASLT
   cout << "" << endl;
@@ -433,61 +437,62 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle
 	return 0;
 #else
     int has_error = 0;
-    cublasLtMatmulDesc_t matmulDesc = NULL;
-    cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
+    hipblasLtMatmulDesc_t matmulDesc = NULL;
+    hipblasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
     rocblas_operation opT = rocblas_operation_transpose;
-    cublasLtPointerMode_t alphaVec = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
-    cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
-    cublasLtOrder_t col_turing = CUBLASLT_ORDER_COL4_4R2_8C;
-    cublasLtOrder_t col_ampere = CUBLASLT_ORDER_COL32_2R_4R4;
+    hipblasLtPointerMode_t alphaVec = hipblasLt_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
+    hipblasLtOrder_t col32 = hipblasLt_ORDER_COL32;
+    hipblasLtOrder_t col_turing = hipblasLt_ORDER_COL4_4R2_8C;
+    hipblasLtOrder_t col_ampere = hipblasLt_ORDER_COL32_2R_4R4;
 
-    has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Adesc, HIP_R_8I, m, k, lda));
-    has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Bdesc, HIP_R_8I, n, k, ldb));
+    has_error |= checkCublasStatus(hipblasLtMatrixLayoutCreate(&Adesc, HIPBLAS_R_8I, m, k, lda));
+    has_error |= checkCublasStatus(hipblasLtMatrixLayoutCreate(&Bdesc, HIPBLAS_R_8I, n, k, ldb));
 
-    has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+    has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Adesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
     if(FORMATB == COL_TURING)
-      has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
+      has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
     else
-      has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
+      has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
 
     if(DTYPE_OUT == 32)
     {
-      has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, HIP_R_32I));
-      has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
-      has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, HIP_R_32I, m, n, ldc));
-      has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+      has_error |= checkCublasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIPBLAS_R_32I));
+      has_error |= checkCublasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+      has_error |= checkCublasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLAS_R_32I, m, n, ldc));
+      has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
       int alpha = 1, beta = 0;
-      has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, NULL, NULL, 0, 0));
+      has_error |= checkCublasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, NULL, NULL, 0, 0));
     }
     else
     {
-      has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, HIP_R_32F));
-      has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
-      has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, HIP_R_8I, m, n, ldc));
-      has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+      has_error |= checkCublasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIPBLAS_R_32F));
+      has_error |= checkCublasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+      has_error |= checkCublasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLAS_R_8I, m, n, ldc));
+      has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
       if(!SCALE_ROWS)
       {
         float alpha = 1.0f, beta = 0.0f;
-        has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+        has_error |= checkCublasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
       }
       else
       {
-        has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
-        has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, NULL, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+        has_error |= checkCublasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
+        has_error |= checkCublasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, NULL, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
       }
     }
 
 
-    if (Cdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Cdesc));
-    if (Bdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Bdesc));
-    if (Adesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Adesc));
-    if (matmulDesc) has_error |= checkCublasStatus(cublasLtMatmulDescDestroy(matmulDesc));
+    if (Cdesc) has_error |= checkCublasStatus(hipblasLtMatrixLayoutDestroy(Cdesc));
+    if (Bdesc) has_error |= checkCublasStatus(hipblasLtMatrixLayoutDestroy(Bdesc));
+    if (Adesc) has_error |= checkCublasStatus(hipblasLtMatrixLayoutDestroy(Adesc));
+    if (matmulDesc) has_error |= checkCublasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
     if(has_error == 1)
       printf("error detected");
 
     return has_error;
 #endif
 }
+#endif
 
 int fill_up_to_nearest_multiple(int value, int multiple)
 {
@@ -601,57 +606,52 @@ template <int FORMAT, int TRANSPOSE> void transformRowToFormat(char * A, char *o
 
 void spmm_coo(hipsparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B)
 {
+  hipsparseSpMatDescr_t descA;
+  hipsparseDnMatDescr_t descB, descC;
+
+  float alpha = 1.0f;
+  float beta = 0.0f;
+  void *dBuffer = NULL;
+  size_t bufferSize = 0;
+
+  CHECK_HIPSPARSE( hipsparseCreateCoo(&descA, A_rows, A_cols, A_nnz,
+                                    A_rowidx, A_colidx, A_vals,
+                                    HIPSPARSE_INDEX_32I,
+                                    HIPSPARSE_INDEX_BASE_ZERO, HIP_R_16F) );
+  // Create dense matrix C
+  CHECK_HIPSPARSE( hipsparseCreateDnMat(&descC, A_rows, B_cols, ldc, C,
+                                      HIP_R_16F, HIPSPARSE_ORDER_ROW) );
+  // Create dense matrix B
+  if(transposed_B)
+  {
+    int tmp = A_cols;
+    A_cols = B_cols;
+    B_cols = tmp;
+  }
 
-#ifdef NO_CUBLASLT
-#else
-
-    hipsparseSpMatDescr_t descA;
-    hipsparseDnMatDescr_t descB, descC;
-
-    float alpha = 1.0f;
-    float beta = 0.0f;
-    void *dBuffer = NULL;
-    size_t bufferSize = 0;
-
-    CHECK_CUSPARSE( hipsparseCreateCoo(&descA, A_rows, A_cols, A_nnz,
-                                      A_rowidx, A_colidx, A_vals,
-                                      HIPSPARSE_INDEX_32I,
-                                      HIPSPARSE_INDEX_BASE_ZERO, HIP_R_16F) );
-    // Create dense matrix C
-    CHECK_CUSPARSE( hipsparseCreateDnMat(&descC, A_rows, B_cols, ldc, C,
-                                        HIP_R_16F, CUSPARSE_ORDER_ROW) );
-    // Create dense matrix B
-    if(transposed_B)
-    {
-      int tmp = A_cols;
-      A_cols = B_cols;
-      B_cols = tmp;
-    }
-
-    CHECK_CUSPARSE( hipsparseCreateDnMat(&descB, A_cols, B_cols, ldb, B,
-                                        HIP_R_16F, CUSPARSE_ORDER_ROW) );
-    // allocate an external buffer if needed
-    CHECK_CUSPARSE( hipsparseSpMM_bufferSize(
-                                 handle,
-                                 HIPSPARSE_OPERATION_NON_TRANSPOSE,
-                                 transposed_B ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE,
-                                 &alpha, descA, descB, &beta, descC, HIP_R_32F,
-                                 CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) );
-    CUDA_CHECK_RETURN( hipMalloc(&dBuffer, bufferSize) );
-
-    // execute SpMM
-    CHECK_CUSPARSE( hipsparseSpMM(handle,
-                                 HIPSPARSE_OPERATION_NON_TRANSPOSE,
-                                 transposed_B ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE,
-                                 &alpha, descA, descB, &beta, descC, HIP_R_32F,
-                                 CUSPARSE_SPMM_ALG_DEFAULT, dBuffer));
-
-    // destroy matrix/vector descriptors
-    CHECK_CUSPARSE( hipsparseDestroySpMat(descA) );
-    CHECK_CUSPARSE( hipsparseDestroyDnMat(descB) );
-    CHECK_CUSPARSE( hipsparseDestroyDnMat(descC) );
-    CUDA_CHECK_RETURN( hipFree(dBuffer) );
-#endif
+  CHECK_HIPSPARSE( hipsparseCreateDnMat(&descB, A_cols, B_cols, ldb, B,
+                                      HIP_R_16F, HIPSPARSE_ORDER_ROW) );
+  // allocate an external buffer if needed
+  CHECK_HIPSPARSE( hipsparseSpMM_bufferSize(
+                                handle,
+                                HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                transposed_B ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                &alpha, descA, descB, &beta, descC, HIP_R_32F,
+                                HIPSPARSE_SPMM_ALG_DEFAULT, &bufferSize) );
+  CUDA_CHECK_RETURN( hipMalloc(&dBuffer, bufferSize) );
+
+  // execute SpMM
+  CHECK_HIPSPARSE( hipsparseSpMM(handle,
+                                HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                transposed_B ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                                &alpha, descA, descB, &beta, descC, HIP_R_32F,
+                                HIPSPARSE_SPMM_ALG_DEFAULT, dBuffer));
+
+  // destroy matrix/vector descriptors
+  CHECK_HIPSPARSE( hipsparseDestroySpMat(descA) );
+  CHECK_HIPSPARSE( hipsparseDestroyDnMat(descB) );
+  CHECK_HIPSPARSE( hipsparseDestroyDnMat(descC) );
+  CUDA_CHECK_RETURN( hipFree(dBuffer) );
 }
 
 template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
@@ -757,12 +757,14 @@ template void extractOutliers<COL_AMPERE>(char * A, int *idx, char *out, int idx
 template void spmm_coo_very_sparse_naive<half, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
 template void spmm_coo_very_sparse_naive<signed char, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
 
-template int igemmlt<COL_TURING, 32, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_TURING, 8, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_TURING, 8, 1>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_AMPERE, 32, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_AMPERE, 8, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_AMPERE, 8, 1>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+#ifndef NO_HIPBLASLT
+template int igemmlt<COL_TURING, 32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_TURING, 8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_TURING, 8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_AMPERE, 32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_AMPERE, 8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<COL_AMPERE, 8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+#endif
 
 template void transformRowToFormat<COL32, 0>(char * A, char *out, int rows, int cols);
 template void transformRowToFormat<COL32, 1>(char * A, char *out, int rows, int cols);
@@ -797,14 +799,14 @@ template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
 
 MAKE_optimizer32bit(ADAM, half)
 MAKE_optimizer32bit(ADAM, float)
-MAKE_optimizer32bit(ADAM, __nv_bfloat16)
+MAKE_optimizer32bit(ADAM, hip_bfloat16)
 MAKE_optimizer32bit(MOMENTUM, half)
 MAKE_optimizer32bit(MOMENTUM, float)
 MAKE_optimizer32bit(RMSPROP, half)
 MAKE_optimizer32bit(RMSPROP, float)
 MAKE_optimizer32bit(LION, half)
 MAKE_optimizer32bit(LION, float)
-MAKE_optimizer32bit(LION, __nv_bfloat16)
+MAKE_optimizer32bit(LION, hip_bfloat16)
 MAKE_optimizer32bit(ADAGRAD, half)
 MAKE_optimizer32bit(ADAGRAD, float)
 
@@ -840,11 +842,11 @@ MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
 MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
 MAKE_optimizerStatic8bitBlockwise(half, LION);
 MAKE_optimizerStatic8bitBlockwise(float, LION);
-MAKE_optimizerStatic8bitBlockwise(__nv_bfloat16, LION);
+MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, LION);
 MAKE_optimizerStatic8bitBlockwise(half, ADAGRAD);
 MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD);
 
 template void percentileClipping(float * g, float *gnorm_vec, int step, const int n);
 template void percentileClipping(half * g, float *gnorm_vec, int step, const int n);
 
-MAKE_optimizerStatic8bitBlockwise(__nv_bfloat16, ADAM);
+MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAM);
diff --git a/csrc/ops_hip.cuh b/csrc/ops.hiph
similarity index 85%
rename from csrc/ops_hip.cuh
rename to csrc/ops.hiph
index cddc6d913..dc7bbffa5 100644
--- a/csrc/ops_hip.cuh
+++ b/csrc/ops.hiph
@@ -16,14 +16,17 @@
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_fp16.h>
 #include <rocblas/rocblas.h>
-#include <cublasLt.h>
+#ifndef NO_HIPBLASLT
+#include <hipblaslt/hipblaslt.h>
+#endif
 #include <hipsparse/hipsparse.h>
 #include <vector>
 #include <functional>
 
+/*
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
-
+*/
 
 
 #define CUDA_CHECK_RETURN(value) {                      \
@@ -36,28 +39,29 @@
 
 #define THREADS_PER_BLOCKS (512)
 
-#define CHECK_CUSPARSE(value) {                      \
-  hipsparseStatus_t _m_cudaStat = value;                    \
-  if (_m_cudaStat != HIPSPARSE_STATUS_SUCCESS) {                   \
-    fprintf(stderr, "Error %s at line %d in file %s\n",         \
-        cusparseGetErrorString(_m_cudaStat), __LINE__, __FILE__);   \
+#define CHECK_HIPSPARSE(value) {                      \
+  hipsparseStatus_t _m_hipStat = value;                    \
+  if (_m_hipStat != HIPSPARSE_STATUS_SUCCESS) {                   \
+    fprintf(stderr, "Error <sparse error> at line %d in file %s\n",         \
+        __LINE__, __FILE__);   \
     exit(1);                              \
   } }
 
 
+
 #define THREADS_PER_BLOCKS (512)
 
 
-inline void checkCudaStatus(hipError_t status) {
+inline void v(hipError_t status) {
     if (status != hipSuccess) {
-        printf("cuda API failed with status %d: %s\n", status, hipGetErrorString(status));
-        throw std::logic_error("cuda API failed");
+        printf("hip API failed with status %d: %s\n", status, hipGetErrorString(status));
+        throw std::logic_error("hip API failed");
     }
 }
 
-inline int checkCublasStatus(rocblas_status status) {
+inline int checkHiblasStatus(rocblas_status status) {
     if (status != rocblas_status_success) {
-        printf("cuBLAS API failed with status %d\n", status);
+        printf("hipBLAS API failed with status %d\n", status);
         //throw std::logic_error("cuBLAS API failed");
         return 1;
     }
@@ -116,26 +120,27 @@ class Context
 
 };
 
+#ifndef NO_HIPBLASLT
 class ContextLt
 {
     public:
-				cublasLtHandle_t m_handle;
+				hipblasLtHandle_t m_handle;
 
 				ContextLt()
 				{
-					cublasLtHandle_t handle;
-					cublasLtCreate(&handle);
+					hipblasLtHandle_t handle;
+					rocblasLtCreate(&handle);
 					m_handle = handle;
 				}
-
 };
+#endif
 
-class ContextCusparse
+class ContextHipsparse
 {
     public:
 				hipsparseHandle_t m_handle;
 
-				ContextCusparse()
+				ContextHipsparse()
 				{
 					hipsparseHandle_t handle;
 					hipsparseCreate(&handle);
@@ -180,9 +185,12 @@ void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, i
                     long long int strideA, long long int strideB, long long int strideC, int batchCount);
 
 
-template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+#ifndef NO_HIPBLASLT
+template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+
+template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(hipblasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2);
+#endif
 
-template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(cublasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2);
 void cutlass_igemm(bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc);
 void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols);
 void getColRowStats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols);
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index 56b322e72..704d922cb 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -51,12 +51,12 @@ MAKE_FUNC32(momentum, MOMENTUM, float, 32)
 MAKE_FUNC32(momentum, MOMENTUM, half, 16)
 MAKE_FUNC32(adam, ADAM, float, fp32)
 MAKE_FUNC32(adam, ADAM, half, fp16)
-MAKE_FUNC32(adam, ADAM, __nv_bfloat16, bf16)
+//MAKE_FUNC32(adam, ADAM, hip_bfloat16, bf16)
 MAKE_FUNC32(rmsprop, RMSPROP, float, 32)
 MAKE_FUNC32(rmsprop, RMSPROP, half, 16)
 MAKE_FUNC32(lion, LION, float, fp32)
 MAKE_FUNC32(lion, LION, half, fp16)
-MAKE_FUNC32(lion, LION, __nv_bfloat16, bf16)
+//MAKE_FUNC32(lion, LION, hip_bfloat16, bf16)
 MAKE_FUNC32(adagrad, ADAGRAD, float, 32)
 MAKE_FUNC32(adagrad, ADAGRAD, half, 16)
 
@@ -96,10 +96,10 @@ MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, fp16)
 MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, fp32)
 MAKE_BLOCKWISE8(adagrad, ADAGRAD, half, fp16)
 MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, fp32)
-MAKE_BLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16)
+//MAKE_BLOCKWISE8(adam, ADAM, hip_bfloat16, bf16)
 MAKE_BLOCKWISE8(lion, LION, half, fp16)
 MAKE_BLOCKWISE8(lion, LION, float, fp32)
-MAKE_BLOCKWISE8(lion, LION, __nv_bfloat16, bf16)
+//MAKE_BLOCKWISE8(lion, LION, hip_bfloat16, bf16)
 
 
 void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); }
@@ -120,21 +120,21 @@ void dequantizeBlockwise_fp16_nf4(float *code, unsigned char *A, float *absmax,
 void dequantizeBlockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, NF4>(NULL, A, absmax, out, blocksize, n); }
 
 #ifndef NO_HIPBLASLT
-
 #if BUILD_CUDA
 #define MAKE_FUNC_TRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
 void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(cublasLtHandle_t ltHandle, dtype *A, dtype *out, int dim1, int dim2) \
 { \
 	transform<dtype, src, target, transpose, bits>(ltHandle, A, out, dim1, dim2); \
 } \
-#endif
 
+#endif
 #if BUILD_HIP
 #define MAKE_FUNC_TRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
 void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(hipblasLtHandle_t ltHandle, dtype *A, dtype *out, int dim1, int dim2) \
 { \
 	transform<dtype, src, target, transpose, bits>(ltHandle, A, out, dim1, dim2); \
 } \
+
 #endif
 
 MAKE_FUNC_TRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8);
@@ -206,6 +206,7 @@ void spmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_r
 
 void spmm_coo_very_sparse_naive_int8(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
 { spmm_coo_very_sparse_naive<signed char, 8>(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
+
 #endif
 
 extern "C"
@@ -239,14 +240,14 @@ extern "C"
 
 	MAKE_CFUNC32(adam, float, fp32)
 	MAKE_CFUNC32(adam, half, fp16)
-	MAKE_CFUNC32(adam, __nv_bfloat16, bf16)
+	//MAKE_CFUNC32(adam, hip_bfloat16, bf16)
 	MAKE_CFUNC32(momentum, float, 32)
 	MAKE_CFUNC32(momentum, half, 16)
 	MAKE_CFUNC32(rmsprop, float, 32)
 	MAKE_CFUNC32(rmsprop, half, 16)
 	MAKE_CFUNC32(lion, float, fp32)
 	MAKE_CFUNC32(lion, half, fp16)
-	MAKE_CFUNC32(lion, __nv_bfloat16, bf16)
+	//MAKE_CFUNC32(lion, hip_bfloat16, bf16)
 	MAKE_CFUNC32(adagrad, float, 32)
 	MAKE_CFUNC32(adagrad, half, 16)
 
@@ -286,10 +287,10 @@ extern "C"
 	MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, fp32)
 	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, half, fp16)
 	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, float, fp32)
-	MAKE_CBLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16)
+	//MAKE_CBLOCKWISE8(adam, ADAM, hip_bfloat16, bf16)
 	MAKE_CBLOCKWISE8(lion, LION, half, fp16)
 	MAKE_CBLOCKWISE8(lion, LION, float, fp32)
-	MAKE_CBLOCKWISE8(lion, LION, __nv_bfloat16, bf16)
+	//MAKE_CBLOCKWISE8(lion, LION, hip_bfloat16, bf16)
 
 	void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); }
 	void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); }
@@ -312,7 +313,6 @@ extern "C"
 
 
 #ifndef NO_HIPBLASLT
-
 #if BUILD_CUDA
 	int cigemmlt_turing_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
 	{ return igemmlt_turing_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
@@ -339,9 +339,10 @@ extern "C"
 	{ \
 		transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((cublasLtHandle_t) context->m_handle, A, out, dim1, dim2); \
 	} \
+	
 #endif
 
-#if BUILD_CUDA
+#if BUILD_HIP
 	int cigemmlt_turing_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
 	{ return igemmlt_turing_32((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
 	//{ (hipblasLtHandle_t)context->m_handle; return 0; }
@@ -367,7 +368,7 @@ extern "C"
 	{ \
 		transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((hipblasLtHandle_t) context->m_handle, A, out, dim1, dim2); \
 	} \
-#endif
+
 
 #endif
 
@@ -380,7 +381,7 @@ extern "C"
 	MAKE_FUNC_CTRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8)
 	MAKE_FUNC_CTRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32)
-
+#endif
 	void cdequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols)
 	{ dequant_mm_int32_fp16(A, rowStats, colStats, out, newRowStats, newcolStats, bias, numRows, numCols); }
 	void cget_col_row_stats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols)
diff --git a/csrc/pythonInterface_hip.c b/csrc/test_delete_later.c
similarity index 100%
rename from csrc/pythonInterface_hip.c
rename to csrc/test_delete_later.c
diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h
index 3d2de1b35..347ec9c5e 100644
--- a/include/Algo-Direct2.h
+++ b/include/Algo-Direct2.h
@@ -227,8 +227,8 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
 
 #endif
 
-        IVec<AVX, float> vlem = vz < vxm;
-        IVec<AVX, float> vlep = vz < vxp;
+        IVec<SSE, float> vlem = operator< (vz, vxm);
+        IVec<SSE, float> vlep = operator< (vz, vxp);
         ip = ip + vlem + vlep;
 
         ip.store(pr);
@@ -277,8 +277,8 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
 //        FVec<AVX, double> vxp = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01p), h23p, 1);
 
         IVec<AVX, double> i(u.vec);
-        IVec<AVX, double> vlem = vz < vxm;
-        IVec<AVX, double> vlep = vz < vxp;
+        IVec<SSE, float> vlem = operator< (vz, vxm);
+        IVec<SSE, float> vlep = operator< (vz, vxp);
         i = i + vlem + vlep;
         i.extractLo32s().store(pr);
     }

From fb780a0adb887c2aed624d108e5f0b66ab8ca33e Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Tue, 11 Jul 2023 20:09:03 -0500
Subject: [PATCH 006/234] fixed runtime error (low accuracy)

---
 bitsandbytes/__main__.py        |  2 +-
 bitsandbytes/cuda_setup/main.py |  9 +++++++--
 bitsandbytes/functional.py      |  8 ++++----
 check_bnb_install.py            |  2 ++
 csrc/kernels.hip                | 28 ++++++++++------------------
 csrc/ops.hip                    | 12 ++++++------
 6 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py
index a100b2919..1c60346be 100644
--- a/bitsandbytes/__main__.py
+++ b/bitsandbytes/__main__.py
@@ -103,7 +103,7 @@ def print_debug_info() -> None:
 print_header("OTHER")
 print(f"COMPILED_WITH_CUDA = {COMPILED_WITH_CUDA}")
 cuda = get_cuda_lib_handle()
-print(f"COMPUTE_CAPABILITIES_PER_GPU = {get_compute_capabilities(cuda)}")
+#print(f"COMPUTE_CAPABILITIES_PER_GPU = {get_compute_capabilities(cuda)}")
 print_header("")
 print_header("DEBUG INFO END")
 print_header("")
diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index 295444f59..f5739111d 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -138,6 +138,8 @@ def run_cuda_setup(self):
                 self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path}...")
                 self.lib = ct.cdll.LoadLibrary(binary_path)
         except Exception as ex:
+            #debug
+            self.add_log_entry("Exception in run_cuda_setup: \n")
             self.add_log_entry(str(ex))
 
     def add_log_entry(self, msg, is_warning=False):
@@ -380,9 +382,12 @@ def evaluate_cuda_setup():
         print(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'),
               ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
         print('='*80)
-    if not torch.cuda.is_available(): return 'libbitsandbytes_cpu.so', None, None, None, None
-    if torch.version.hip: return 'libbitsandbytes_hip_nohipblaslt.so', None, None, None, None
+    if not torch.cuda.is_available(): 
+        return 'libbitsandbytes_cpu.so', None, None, None, None
+    if torch.version.hip: 
+        return 'libbitsandbytes_hip_nohipblaslt.so', None, None, None, None
 
+    print("WHAT THE FUCK IS THIS")
 
     cuda_setup = CUDASetup.get_instance()
     cudart_path = determine_cuda_runtime_lib_path()
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 0b9e1203f..226c9e51f 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -28,7 +28,7 @@ def prod(iterable):
 if COMPILED_WITH_CUDA:
     """C FUNCTIONS FOR OPTIMIZERS"""
     str2optimizer32bit = {}
-    str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16, lib.cadam32bit_grad_bf16)
+    str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16) #, lib.cadam32bit_grad_bf16)
     str2optimizer32bit["momentum"] = (
         lib.cmomentum32bit_grad_32,
         lib.cmomentum32bit_grad_16,
@@ -37,7 +37,7 @@ def prod(iterable):
         lib.crmsprop32bit_grad_32,
         lib.crmsprop32bit_grad_16,
     )
-    str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16, lib.clion32bit_grad_bf16)
+    str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16) #, lib.clion32bit_grad_bf16)
     str2optimizer32bit["adagrad"] = (
         lib.cadagrad32bit_grad_32,
         lib.cadagrad32bit_grad_16,
@@ -73,7 +73,7 @@ def prod(iterable):
     str2optimizer8bit_blockwise["adam"] = (
         lib.cadam_8bit_blockwise_grad_fp32,
         lib.cadam_8bit_blockwise_grad_fp16,
-        lib.cadam_8bit_blockwise_grad_bf16,
+        #lib.cadam_8bit_blockwise_grad_bf16,
     )
     str2optimizer8bit_blockwise["momentum"] = (
         lib.cmomentum_8bit_blockwise_grad_fp32,
@@ -86,7 +86,7 @@ def prod(iterable):
     str2optimizer8bit_blockwise["lion"] = (
         lib.clion_8bit_blockwise_grad_fp32,
         lib.clion_8bit_blockwise_grad_fp16,
-        lib.clion_8bit_blockwise_grad_bf16,
+        #lib.clion_8bit_blockwise_grad_bf16,
     )
     str2optimizer8bit_blockwise["adagrad"] = (
         lib.cadagrad_8bit_blockwise_grad_fp32,
diff --git a/check_bnb_install.py b/check_bnb_install.py
index 77cd03ec4..e50afb0a1 100644
--- a/check_bnb_install.py
+++ b/check_bnb_install.py
@@ -8,6 +8,8 @@
 
 adam = bnb.optim.Adam([p])
 
+
+
 out = a*p
 loss = out.sum()
 loss.backward()
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 2bd72504a..019024014 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -3758,63 +3758,55 @@ MAKE_kQuantizeBlockwise(half,  4096, 4, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,  4096, 4, 1, General8bit)
 MAKE_kQuantizeBlockwise(half,  2048, 4, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,  1024, 4, 0, General8bit)
-/*
 MAKE_kQuantizeBlockwise(half,   512, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,   256, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,   128, 2, 0, General8bit)
-MAKE_kQuantizeBlockwise(half,    64, 2, 0, General8bit)
-*/
+//MAKE_kQuantizeBlockwise(half,    64, 2, 0, General8bit)
 
 MAKE_kQuantizeBlockwise(float, 4096, 4, 0, General8bit)
 MAKE_kQuantizeBlockwise(float, 4096, 4, 1, General8bit)
 MAKE_kQuantizeBlockwise(float, 2048, 4, 0, General8bit)
 MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit)
-/*
 MAKE_kQuantizeBlockwise(float,  512, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(float,  256, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(float,  128, 2, 0, General8bit)
-MAKE_kQuantizeBlockwise(float,   64, 2, 0, General8bit)
-*/
+//MAKE_kQuantizeBlockwise(float,   64, 2, 0, General8bit)
 
 MAKE_kQuantizeBlockwise(half,  4096, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(half,  2048, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(half,  1024, 4, 0, FP4)
-/*
 MAKE_kQuantizeBlockwise(half,   512, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(half,   256, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(half,   128, 2, 0, FP4)
-MAKE_kQuantizeBlockwise(half,    64, 2, 0, FP4)
-*/
+//MAKE_kQuantizeBlockwise(half,    64, 2, 0, FP4)
+
 
 MAKE_kQuantizeBlockwise(float, 4096, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(float, 2048, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(float, 1024, 4, 0, FP4)
-/*
 MAKE_kQuantizeBlockwise(float,  512, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(float,  256, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(float,  128, 2, 0, FP4)
-MAKE_kQuantizeBlockwise(float,   64, 2, 0, FP4)
-*/
+//MAKE_kQuantizeBlockwise(float,   64, 2, 0, FP4)
+
 
 MAKE_kQuantizeBlockwise(half,  4096, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(half,  2048, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(half,  1024, 4, 0, NF4)
-/*
 MAKE_kQuantizeBlockwise(half,   512, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(half,   256, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(half,   128, 2, 0, NF4)
-MAKE_kQuantizeBlockwise(half,    64, 2, 0, NF4)
-*/
+//MAKE_kQuantizeBlockwise(half,    64, 2, 0, NF4)
+
 
 MAKE_kQuantizeBlockwise(float, 4096, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(float, 2048, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(float, 1024, 4, 0, NF4)
-/*
 MAKE_kQuantizeBlockwise(float,  512, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(float,  256, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(float,  128, 2, 0, NF4)
-MAKE_kQuantizeBlockwise(float,   64, 2, 0, NF4)
-*/
+//MAKE_kQuantizeBlockwise(float,   64, 2, 0, NF4)
+
 
 template __global__ void kDequantizeBlockwise<half, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
 template __global__ void kDequantizeBlockwise<float, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 84019aabd..9dcdcc012 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -74,8 +74,8 @@ template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(floa
    hipLaunchKernelGGL(( kQuantizeBlockwise<T, 256, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(128), 0, 0, code, A, absmax, out, rand, rand_offset, n);
   else if(blocksize == 128)
    hipLaunchKernelGGL(( kQuantizeBlockwise<T, 128, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(64), 0, 0, code, A, absmax, out, rand, rand_offset, n);
-  else if(blocksize == 64)
-   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(32), 0, 0, code, A, absmax, out, rand, rand_offset, n);
+  //else if(blocksize == 64)
+  // hipLaunchKernelGGL(( kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(32), 0, 0, code, A, absmax, out, rand, rand_offset, n);
 
 
   CUDA_CHECK_RETURN(hipPeekAtLastError());
@@ -799,14 +799,14 @@ template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
 
 MAKE_optimizer32bit(ADAM, half)
 MAKE_optimizer32bit(ADAM, float)
-MAKE_optimizer32bit(ADAM, hip_bfloat16)
+//MAKE_optimizer32bit(ADAM, hip_bfloat16)
 MAKE_optimizer32bit(MOMENTUM, half)
 MAKE_optimizer32bit(MOMENTUM, float)
 MAKE_optimizer32bit(RMSPROP, half)
 MAKE_optimizer32bit(RMSPROP, float)
 MAKE_optimizer32bit(LION, half)
 MAKE_optimizer32bit(LION, float)
-MAKE_optimizer32bit(LION, hip_bfloat16)
+//MAKE_optimizer32bit(LION, hip_bfloat16)
 MAKE_optimizer32bit(ADAGRAD, half)
 MAKE_optimizer32bit(ADAGRAD, float)
 
@@ -842,11 +842,11 @@ MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
 MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
 MAKE_optimizerStatic8bitBlockwise(half, LION);
 MAKE_optimizerStatic8bitBlockwise(float, LION);
-MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, LION);
+//MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, LION);
 MAKE_optimizerStatic8bitBlockwise(half, ADAGRAD);
 MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD);
 
 template void percentileClipping(float * g, float *gnorm_vec, int step, const int n);
 template void percentileClipping(half * g, float *gnorm_vec, int step, const int n);
 
-MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAM);
+//MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAM);

From 1048264bd55da24d0aa4a76e47ea01b0cee66d52 Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Wed, 12 Jul 2023 00:47:00 -0500
Subject: [PATCH 007/234] FIX LOW ACCURACY

---
 bitsandbytes/archive_functional.py | 2397 ++++++++++++++++++++++++++++
 bitsandbytes/cuda_setup/main.py    |    1 -
 bitsandbytes/functional.py         |   26 +-
 bitsandbytes/nn/modules.py         |    3 +-
 4 files changed, 2413 insertions(+), 14 deletions(-)
 create mode 100644 bitsandbytes/archive_functional.py

diff --git a/bitsandbytes/archive_functional.py b/bitsandbytes/archive_functional.py
new file mode 100644
index 000000000..226c9e51f
--- /dev/null
+++ b/bitsandbytes/archive_functional.py
@@ -0,0 +1,2397 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import ctypes as ct
+import itertools
+import operator
+import random
+import torch
+import itertools
+import math
+from scipy.stats import norm
+import numpy as np
+
+from functools import reduce  # Required in Python 3
+from typing import Tuple
+from torch import Tensor
+
+from .cextension import COMPILED_WITH_CUDA, lib
+
+
+# math.prod not compatible with python < 3.8
+def prod(iterable):
+    return reduce(operator.mul, iterable, 1)
+
+name2qmap = {}
+
+if COMPILED_WITH_CUDA:
+    """C FUNCTIONS FOR OPTIMIZERS"""
+    str2optimizer32bit = {}
+    str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16) #, lib.cadam32bit_grad_bf16)
+    str2optimizer32bit["momentum"] = (
+        lib.cmomentum32bit_grad_32,
+        lib.cmomentum32bit_grad_16,
+    )
+    str2optimizer32bit["rmsprop"] = (
+        lib.crmsprop32bit_grad_32,
+        lib.crmsprop32bit_grad_16,
+    )
+    str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16) #, lib.clion32bit_grad_bf16)
+    str2optimizer32bit["adagrad"] = (
+        lib.cadagrad32bit_grad_32,
+        lib.cadagrad32bit_grad_16,
+    )
+
+    str2optimizer8bit = {}
+    str2optimizer8bit["adam"] = (
+        lib.cadam_static_8bit_grad_32,
+        lib.cadam_static_8bit_grad_16,
+    )
+    str2optimizer8bit["momentum"] = (
+        lib.cmomentum_static_8bit_grad_32,
+        lib.cmomentum_static_8bit_grad_16,
+    )
+    str2optimizer8bit["rmsprop"] = (
+        lib.crmsprop_static_8bit_grad_32,
+        lib.crmsprop_static_8bit_grad_16,
+    )
+    str2optimizer8bit["lion"] = (
+        lib.clion_static_8bit_grad_32,
+        lib.clion_static_8bit_grad_16,
+    )
+    str2optimizer8bit["lamb"] = (
+        lib.cadam_static_8bit_grad_32,
+        lib.cadam_static_8bit_grad_16,
+    )
+    str2optimizer8bit["lars"] = (
+        lib.cmomentum_static_8bit_grad_32,
+        lib.cmomentum_static_8bit_grad_16,
+    )
+
+    str2optimizer8bit_blockwise = {}
+    str2optimizer8bit_blockwise["adam"] = (
+        lib.cadam_8bit_blockwise_grad_fp32,
+        lib.cadam_8bit_blockwise_grad_fp16,
+        #lib.cadam_8bit_blockwise_grad_bf16,
+    )
+    str2optimizer8bit_blockwise["momentum"] = (
+        lib.cmomentum_8bit_blockwise_grad_fp32,
+        lib.cmomentum_8bit_blockwise_grad_fp16,
+    )
+    str2optimizer8bit_blockwise["rmsprop"] = (
+        lib.crmsprop_8bit_blockwise_grad_fp32,
+        lib.crmsprop_8bit_blockwise_grad_fp16,
+    )
+    str2optimizer8bit_blockwise["lion"] = (
+        lib.clion_8bit_blockwise_grad_fp32,
+        lib.clion_8bit_blockwise_grad_fp16,
+        #lib.clion_8bit_blockwise_grad_bf16,
+    )
+    str2optimizer8bit_blockwise["adagrad"] = (
+        lib.cadagrad_8bit_blockwise_grad_fp32,
+        lib.cadagrad_8bit_blockwise_grad_fp16,
+    )
+
+class GlobalPageManager:
+    _instance = None
+
+    def __init__(self):
+        raise RuntimeError("Call get_instance() instead")
+
+    def initialize(self):
+        self.paged_tensors = []
+
+    @classmethod
+    def get_instance(cls):
+        if cls._instance is None:
+            cls._instance = cls.__new__(cls)
+            cls._instance.initialize()
+        return cls._instance
+
+    def prefetch_all(self, to_cpu=False):
+        # assume the first added, will be hte
+        # ones that are used first, so swap them in last
+        # in the case they are evicted again
+        for t in self.paged_tensors[::-1]:
+            prefetch_tensor(t, to_cpu)
+
+
+
+class CUBLAS_Context:
+    _instance = None
+
+    def __init__(self):
+        raise RuntimeError("Call get_instance() instead")
+
+    def initialize(self):
+        self.context = {}
+
+    @classmethod
+    def get_instance(cls):
+        if cls._instance is None:
+            cls._instance = cls.__new__(cls)
+            cls._instance.initialize()
+        return cls._instance
+
+    def get_context(self, device):
+        if device.index not in self.context:
+            prev_device = torch.cuda.current_device()
+            torch.cuda.set_device(device)
+            self.context[device.index] = ct.c_void_p(lib.get_context())
+            torch.cuda.set_device(prev_device)
+        return self.context[device.index]
+
+
+class Cusparse_Context:
+    _instance = None
+
+    def __init__(self):
+        raise RuntimeError("Call get_instance() instead")
+
+    def initialize(self):
+        #self.context = ct.c_void_p(lib.get_cusparse())
+        if torch.version.cuda:
+            self.context = ct.c_void_p(lib.get_cusparse())
+        elif torch.version.hip:
+            self.context = ct.c_void_p(lib.get_hipsparse())
+
+    @classmethod
+    def get_instance(cls):
+        if cls._instance is None:
+            cls._instance = cls.__new__(cls)
+            cls._instance.initialize()
+        return cls._instance
+
+dtype2bytes = {}
+dtype2bytes[torch.float32] = 4
+dtype2bytes[torch.float16] = 2
+dtype2bytes[torch.bfloat16] = 2
+dtype2bytes[torch.uint8] = 1
+dtype2bytes[torch.int8] = 1
+
+def get_paged(*shape, dtype=torch.float32, device=torch.device('cuda', index=0)):
+    num_bytes = dtype2bytes[dtype]*prod(shape)
+    cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
+    c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int))
+    new_array = np.ctypeslib.as_array(c_ptr, shape=shape)
+    out = torch.frombuffer(new_array, dtype=dtype, count=prod(shape)).view(shape)
+    out.is_paged = True
+    out.page_deviceid = device.index
+    return out
+
+def prefetch_tensor(A, to_cpu=False):
+    assert A.is_paged, 'Only paged tensors can be prefetched!'
+    if to_cpu:
+        deviceid = -1
+    else:
+        deviceid = A.page_deviceid
+
+    num_bytes = dtype2bytes[A.dtype]*A.numel()
+    lib.cprefetch(get_ptr(A), ct.c_size_t(num_bytes), ct.c_int32(deviceid))
+
+def elementwise_func(func_name, A, B, value, prefetch=True):
+    func = None
+    if A.dtype == torch.float32:
+        func = getattr(lib, f'c{func_name}_fp32', None)
+        cvalue = ct.c_float(value)
+    elif A.dtype == torch.uint8:
+        func = getattr(lib, f'c{func_name}_uint8', None)
+        cvalue = ct.c_uint8(value)
+
+    if func is None: raise NotImplementedError(f'Function not implemented: {func_name}')
+
+    is_managed = getattr(A, 'is_managed', False)
+    if is_managed and prefetch:
+        prefetch_tensor(A)
+        if B is not None: prefetch_tensor(B)
+
+    func(get_ptr(A), get_ptr(B), cvalue, ct.c_int64(A.numel()))
+    if A.is_paged or B.is_paged:
+        # paged function are fully asynchronous
+        # if we return from this function, we want to the tensor
+        # to be in the correct state, that is the final state after the
+        # operation occured. So we synchronize.
+        torch.cuda.synchronize()
+
+def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value)
+def arange(A, device=None): elementwise_func('arange', A, None, 0)
+def _mul(A, B, device=None): elementwise_func('_mul', A, B, 0)
+
+
+def create_linear_map(signed=True, total_bits=8, add_zero=True):
+    sign = (-1.0 if signed else 0.0)
+    total_values = 2**total_bits
+    if add_zero or total_bits < 8:
+        # add a zero
+        # since we simulate less bits by having zeros in the data type, we
+        # we need to center the quantization around zero and as such lose
+        # a single value
+        total_values = (2**total_bits if not signed else 2**total_bits-1)
+
+    values = torch.linspace(sign, 1.0, total_values)
+    gap = 256 - values.numel()
+    if gap == 0:
+        return values
+    else:
+        l = values.numel()//2
+        return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist())
+
+def create_normal_map(offset=0.9677083, use_extra_value=True):
+
+    if use_extra_value:
+        # one more positive value, this is an asymmetric type
+        v1 = norm.ppf(torch.linspace(offset, 0.5, 9)[:-1]).tolist()
+        v2 = [0]*(256-15) ## we have 15 non-zero values in this data type
+        v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist()
+        v = v1 + v2 + v3
+    else:
+        v1 = norm.ppf(torch.linspace(offset, 0.5, 8)[:-1]).tolist()
+        v2 = [0]*(256-14) ## we have 14 non-zero values in this data type
+        v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist()
+        v = v1 + v2 + v3
+
+    values = torch.Tensor(v)
+    values = values.sort().values
+    values /= values.max()
+    assert values.numel() == 256
+    return values
+
+def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8):
+    e = exponent_bits
+    p = precision_bits
+    has_sign = 1 if signed else 0
+    assert e+p == total_bits-has_sign
+    # the exponent is biased to 2^(e-1) -1 == 0
+    evalues = []
+    pvalues = []
+    for i, val in enumerate(range(-((2**(exponent_bits-has_sign))), 2**(exponent_bits-has_sign), 1)):
+        evalues.append(2**val)
+
+
+    values = []
+    lst = list(itertools.product([0, 1], repeat=precision_bits))
+    #for ev in evalues:
+    bias = 2**(exponent_bits-1)
+    for evalue in range(2**(exponent_bits)):
+        for bit_pattern in lst:
+            value = (1 if evalue != 0 else 0)
+            for i, pval in enumerate(list(bit_pattern)):
+                value += pval*(2**-(i+1))
+            if evalue == 0:
+                # subnormals
+                value = value*2**-(bias)
+            else:
+                # normals
+                value = value*2**-(evalue-bias-1)
+            values.append(value)
+            if signed:
+                values.append(-value)
+
+
+    assert len(values) == 2**total_bits
+    values.sort()
+    if total_bits < 8:
+        gap = 256 - len(values)
+        for i in range(gap):
+            values.append(0)
+    values.sort()
+    code = torch.Tensor(values)
+    code /= code.max()
+
+    return code
+
+
+
+def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
+    """
+    Creates the dynamic quantiztion map.
+
+    The dynamic data type is made up of a dynamic exponent and
+    fraction. As the exponent increase from 0 to -7 the number
+    of bits available for the fraction shrinks.
+
+    This is a generalization of the dynamic type where a certain
+    number of the bits and be reserved for the linear quantization
+    region (the fraction). n determines the maximum number of
+    exponent bits.
+
+    For more details see
+    (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561]
+    """
+
+    data = []
+    # these are additional items that come from the case
+    # where all the exponent bits are zero and no
+    # indicator bit is present
+    non_sign_bits = total_bits - (1 if signed else 0)
+    additional_items = 2 ** (non_sign_bits - max_exponent_bits) - 1
+    if not signed:
+        additional_items = 2 * additional_items
+    for i in range(max_exponent_bits):
+        fraction_items = int((2 ** (i + non_sign_bits - max_exponent_bits) + 1 if signed else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1))
+        boundaries = torch.linspace(0.1, 1, fraction_items)
+        means = (boundaries[:-1] + boundaries[1:]) / 2.0
+        data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
+        if signed:
+            data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
+
+        if additional_items > 0:
+            boundaries = torch.linspace(0.1, 1, additional_items + 1)
+            means = (boundaries[:-1] + boundaries[1:]) / 2.0
+            data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
+            if signed:
+                data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
+
+    data.append(0)
+    data.append(1.0)
+
+    gap = 256 - len(data)
+    for i in range(gap):
+        data.append(0)
+
+    data.sort()
+    return Tensor(data)
+
+def create_quantile_map(A, total_bits=8):
+    q = estimate_quantiles(A, num_quantiles=2**total_bits-1)
+    q = q.tolist()
+    q.append(0)
+
+    gap = 256 - len(q)
+    for i in range(gap):
+        q.append(0)
+
+    q.sort()
+
+    q = Tensor(q)
+    q = q/q.abs().max()
+    return q
+
+def get_special_format_str():
+    if not torch.cuda.is_available(): return 'col_turing'
+    major, _minor = torch.cuda.get_device_capability()
+    if major <= 7:
+        return "col_turing"
+    if major == 8:
+        return "col_ampere"
+    return "col_turing"
+
+
+
+def is_on_gpu(tensors):
+    on_gpu = True
+    gpu_ids = set()
+    for t in tensors:
+        if t is None: continue # NULL pointers are fine
+        is_paged = getattr(t, 'is_paged', False)
+        on_gpu &= (t.device.type == 'cuda' or is_paged)
+        if not is_paged:
+            gpu_ids.add(t.device.index)
+    if not on_gpu:
+        raise TypeError(f'All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}')
+    if len(gpu_ids) > 1:
+        raise TypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}')
+    return on_gpu
+
+def get_ptr(A: Tensor) -> ct.c_void_p:
+    """
+    Get the ctypes pointer from a PyTorch Tensor.
+
+    Parameters
+    ----------
+    A : torch.tensor
+        The PyTorch tensor.
+
+    Returns
+    -------
+    ctypes.c_void_p
+    """
+    if A is None:
+        return None
+    else:
+        return ct.c_void_p(A.data.data_ptr())
+
+
+def pre_call(device):
+    prev_device = torch.cuda.current_device()
+    torch.cuda.set_device(device)
+    return prev_device
+
+
+def post_call(prev_device):
+    torch.cuda.set_device(prev_device)
+
+
+def get_transform_func(dtype, orderA, orderOut, transpose=False):
+    name = f'ctransform_{(8 if dtype == torch.int8 else 32)}_{orderA}_to_{orderOut}_{"t" if transpose else "n"}'
+    if not hasattr(lib, name):
+        print(name)
+        raise ValueError(
+            f"Transform function not supported: {orderA} to {orderOut} for data type {dtype} and transpose={transpose}"
+        )
+    else:
+        return getattr(lib, name)
+
+
+def get_transform_buffer(
+    shape, dtype, device, to_order, from_order="row", transpose=False
+):
+    # init_func = torch.empty
+    init_func = torch.zeros
+    dims = len(shape)
+
+    if dims == 2:
+        rows = shape[0]
+    elif dims == 3:
+        rows = shape[0] * shape[1]
+    cols = shape[-1]
+
+    state = (shape, to_order)
+    if transpose:
+        # swap dims
+        tmp = rows
+        rows = cols
+        cols = tmp
+        state = (shape[::-1], to_order)
+
+    if to_order == "row" or to_order == "col":
+        return init_func(shape, dtype=dtype, device=device), state
+    elif to_order == "col32":
+        # blocks of 32 columns (padded)
+        cols = 32 * ((cols + 31) // 32)
+        return init_func((rows, cols), dtype=dtype, device=device), state
+    elif to_order == "col_turing":
+        # blocks of 32 columns and 8 rows
+        cols = 32 * ((cols + 31) // 32)
+        rows = 8 * ((rows + 7) // 8)
+        return init_func((rows, cols), dtype=dtype, device=device), state
+    elif to_order == "col_ampere":
+        # blocks of 32 columns and 32 rows
+        cols = 32 * ((cols + 31) // 32)
+        rows = 32 * ((rows + 31) // 32)
+        return init_func((rows, cols), dtype=dtype, device=device), state
+    else:
+        raise NotImplementedError(f"To_order not supported: {to_order}")
+
+
+def nvidia_transform(
+    A,
+    to_order,
+    from_order="row",
+    out=None,
+    transpose=False,
+    state=None,
+    ld=None,
+):
+    if state is None:
+        state = (A.shape, from_order)
+    else:
+        from_order = state[1]
+    if out is None:
+        out, new_state = get_transform_buffer(
+            state[0], A.dtype, A.device, to_order, state[1]
+        )
+    else:
+        new_state = (state[1], to_order)
+    func = get_transform_func(A.dtype, from_order, to_order, transpose)
+
+    shape = state[0]
+    if len(shape) == 2:
+        dim1 = ct.c_int32(shape[0])
+        dim2 = ct.c_int32(shape[1])
+    elif ld is not None:
+        n = prod(shape)
+        dim1 = prod([shape[i] for i in ld])
+        dim2 = ct.c_int32(n // dim1)
+        dim1 = ct.c_int32(dim1)
+    else:
+        dim1 = ct.c_int32(shape[0] * shape[1])
+        dim2 = ct.c_int32(shape[2])
+
+    ptr = CUBLAS_Context.get_instance().get_context(A.device)
+    func(ptr, get_ptr(A), get_ptr(out), dim1, dim2)
+
+    return out, new_state
+
+
+def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, num_quantiles=256) -> Tensor:
+    '''
+    Estimates 256 equidistant quantiles on the input tensor eCDF.
+
+    Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles
+    via the eCDF of the input tensor `A`. This is a fast but approximate algorithm
+    and the extreme quantiles close to 0 and 1 have high variance / large estimation
+    errors. These large errors can be avoided by using the offset variable which trims
+    the distribution. The default offset value of 1/512 ensures minimum entropy encoding -- it
+    trims 1/512 = 0.2% from each side of the distrivution. An offset value of 0.01 to 0.02
+    usually has a much lower error but is not a minimum entropy encoding. Given an offset
+    of 0.02 equidistance points in the range [0.02, 0.98] are used for the quantiles.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input tensor. Any shape.
+    out : torch.Tensor
+        Tensor with the 256 estimated quantiles.
+    offset : float
+        The offset for the first and last quantile from 0 and 1. Default: 1/(2*num_quantiles)
+    num_quantiles : int
+        The number of equally spaced quantiles.
+
+    Returns
+    -------
+    torch.Tensor:
+        The 256 quantiles in float32 datatype.
+    '''
+    if A.numel() < 256: raise NotImplementedError(f'Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values.')
+    if num_quantiles > 256: raise NotImplementedError(f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}")
+    if num_quantiles < 256 and offset == 1/(512):
+        # override default arguments
+        offset = 1/(2*num_quantiles)
+
+    if out is None: out = torch.zeros((256,), dtype=torch.float32, device=A.device)
+    is_on_gpu([A, out])
+    device = pre_call(A.device)
+    if A.dtype == torch.float32:
+        lib.cestimate_quantiles_fp32(get_ptr(A), get_ptr(out), ct.c_float(offset), ct.c_int(A.numel()))
+    elif A.dtype == torch.float16:
+        lib.cestimate_quantiles_fp16(get_ptr(A), get_ptr(out), ct.c_float(offset), ct.c_int(A.numel()))
+    else:
+        raise NotImplementedError(f"Not supported data type {A.dtype}")
+    post_call(device)
+
+    if num_quantiles < 256:
+        step = round(256/num_quantiles)
+        idx = torch.linspace(0, 255, num_quantiles).long().to(A.device)
+        out = out[idx]
+
+    return out
+
+
+def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False) -> Tensor:
+    """
+    Quantize tensor A in blocks of size 4096 values.
+
+    Quantizes tensor A by dividing it into blocks of 4096 values.
+    Then the absolute maximum value within these blocks is calculated
+    for the non-linear quantization.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input tensor.
+    code : torch.Tensor
+        The quantization map.
+    absmax : torch.Tensor
+        The absmax values.
+    out : torch.Tensor
+        The output tensor (8-bit).
+
+    Returns
+    -------
+    torch.Tensor:
+        The 8-bit tensor.
+    tuple(torch.Tensor, torch.Tensor):
+        The quantization state to undo the quantization.
+    """
+
+
+    if code is None:
+        if "dynamic" not in name2qmap:
+            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+        code = name2qmap["dynamic"]
+
+    if absmax is None:
+        n = A.numel()
+        blocks = n // blocksize
+        blocks += 1 if n % blocksize > 0 else 0
+        absmax = torch.zeros((blocks,), device=A.device)
+
+    if out is None:
+        out = torch.zeros_like(A, dtype=torch.uint8)
+
+    if A.device.type != 'cpu':
+        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        cblocksize = ct.c_int32(blocksize)
+        prev_device = pre_call(A.device)
+        code = code.to(A.device)
+        is_on_gpu([code, A, out, absmax])
+        if A.dtype == torch.float32:
+            lib.cquantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()))
+        elif A.dtype == torch.float16:
+            lib.cquantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()))
+        else:
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+        post_call(A.device)
+    else:
+        # cpu
+        code = code.cpu()
+        lib.cquantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel()))
+
+    if nested:
+        offset = absmax.mean()
+        absmax -= offset
+        qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False)
+        state = [qabsmax, code, blocksize, nested, offset, state2]
+    else:
+        state = [absmax, code, blocksize, nested, None, None]
+
+
+
+    return out, state
+
+
+def dequantize_blockwise(
+    A: Tensor,
+    quant_state: Tuple[Tensor, Tensor] = None,
+    absmax: Tensor = None,
+    code: Tensor = None,
+    out: Tensor = None,
+    blocksize: int = 4096,
+    nested=False
+) -> Tensor:
+    """
+    Dequantizes blockwise quantized values.
+
+    Dequantizes the tensor A with maximum absolute values absmax in
+    blocks of size 4096.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input 8-bit tensor.
+    quant_state : tuple(torch.Tensor, torch.Tensor)
+        Tuple of code and absmax values.
+    absmax : torch.Tensor
+        The absmax values.
+    code : torch.Tensor
+        The quantization map.
+    out : torch.Tensor
+        Dequantized output tensor (default: float32)
+
+
+    Returns
+    -------
+    torch.Tensor:
+        Dequantized tensor (default: float32)
+    """
+    assert quant_state is not None or absmax is not None
+    if code is None and quant_state is None:
+        if "dynamic" not in name2qmap:
+            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+        code = name2qmap["dynamic"]
+
+    if out is None:
+        out = torch.zeros_like(A, dtype=torch.float32)
+
+    if quant_state is None:
+       quant_state = (absmax, code, blocksize)
+       assert absmax is not None and out is not None
+    else:
+       absmax, code, blocksize, nested, offset, state2 = quant_state
+       if nested:
+           absmax = dequantize_blockwise(absmax, state2)
+           absmax += offset
+
+
+    if A.device.type != 'cpu':
+        device = pre_call(A.device)
+        code = code.to(A.device)
+        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
+            raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
+        is_on_gpu([A, absmax, out])
+        if out.dtype == torch.float32:
+            lib.cdequantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()))
+        elif out.dtype == torch.float16:
+            lib.cdequantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()))
+        else:
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+        post_call(A.device)
+    else:
+        code = code.cpu()
+        lib.cdequantize_blockwise_cpu_fp32(get_ptr(quant_state[1]), get_ptr(A), get_ptr(quant_state[0]), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel()))
+
+    return out
+
+def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
+    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4')
+
+def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
+    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4')
+
+def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
+    """
+    Quantize tensor A in blocks of 4-bit values.
+
+    Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input tensor.
+    absmax : torch.Tensor
+        The absmax values.
+    out : torch.Tensor
+        The output tensor (8-bit).
+    blocksize : int
+        The blocksize used in quantization.
+    quant_type : str
+        The 4-bit quantization data type {fp4, nf4}
+
+    Returns
+    -------
+    torch.Tensor:
+        The 8-bit tensor with packed 4-bit values.
+    tuple(torch.Tensor, torch.Size, torch.dtype, int):
+        The quantization state to undo the quantization.
+    """
+    if A.device.type != 'cuda':
+        raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
+    if quant_type not in ['fp4', 'nf4']:
+        raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
+
+    n = A.numel()
+    input_shape = A.shape
+
+    if absmax is None:
+        blocks = n // blocksize
+        blocks += 1 if n % blocksize > 0 else 0
+        absmax = torch.zeros((blocks,), device=A.device)
+
+
+    if out is None:
+        out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device)
+
+    assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+
+    prev_device = pre_call(A.device)
+    is_on_gpu([A, out, absmax])
+
+    if A.dtype == torch.float32:
+        if quant_type == 'fp4':
+            lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+        else:
+            lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+    elif A.dtype == torch.float16:
+        if quant_type == 'fp4':
+            lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+        else:
+            lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+    else:
+        raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+    post_call(A.device)
+
+    if compress_statistics:
+        offset = absmax.mean()
+        absmax -= offset
+        #code = create_custom_map().to(absmax.device)
+        #qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256)
+        qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
+        del absmax
+        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
+    else:
+        state = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
+
+    return out, state
+
+def dequantize_fp4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
+    return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
+
+def dequantize_nf4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
+    return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')
+
+def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
+    """
+    Dequantizes FP4 blockwise quantized values.
+
+    Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input 8-bit tensor (packed 4-bit values).
+    quant_state : tuple(torch.Tensor, torch.Size, torch.dtype)
+        Tuple of absmax values, original tensor shape and original dtype.
+    absmax : torch.Tensor
+        The absmax values.
+    out : torch.Tensor
+        Dequantized output tensor.
+    blocksize : int
+        The blocksize used in quantization.
+    quant_type : str
+        The 4-bit quantization data type {fp4, nf4}
+
+
+    Returns
+    -------
+    torch.Tensor:
+        Dequantized tensor.
+    """
+    if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
+        raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
+    if quant_type not in ['fp4', 'nf4']:
+        raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
+
+    if quant_state is None:
+        assert absmax is not None and out is not None
+        shape = out.shape
+        dtype = out.dtype
+    else:
+        absmax, shape, dtype, blocksize, compressed_stats, quant_type = quant_state
+
+
+    if compressed_stats is not None:
+        offset, state2 = compressed_stats
+        absmax = dequantize_blockwise(absmax, state2)
+        absmax += offset
+
+    if out is None:
+        out = torch.empty(shape, dtype=dtype, device=A.device)
+
+    n = out.numel()
+
+
+    device = pre_call(A.device)
+    is_on_gpu([A, absmax, out])
+    if out.dtype == torch.float32:
+        if quant_type == 'fp4':
+            lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
+        else:
+            lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
+    elif out.dtype == torch.float16:
+        if quant_type == 'fp4':
+            lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
+        else:
+            lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
+    else:
+        raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+    post_call(A.device)
+
+    is_transposed = (True if A.shape[0] == 1 else False)
+    if is_transposed: return out.t()
+    else: return out
+
+
+def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
+    if code is None:
+        if "dynamic" not in name2qmap:
+            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+        code = name2qmap["dynamic"]
+        code = code.to(A.device)
+
+    absmax = torch.abs(A).max()
+    inp = A / absmax
+    out = quantize_no_absmax(inp, code, out)
+    return out, (absmax, code)
+
+
+def dequantize(
+    A: Tensor,
+    quant_state: Tuple[Tensor, Tensor] = None,
+    absmax: Tensor = None,
+    code: Tensor = None,
+    out: Tensor = None,
+) -> Tensor:
+    assert quant_state is not None or absmax is not None
+    if code is None and quant_state is None:
+        if "dynamic" not in name2qmap:
+            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+        code = name2qmap["dynamic"]
+        code = code.to(A.device)
+
+    if quant_state is None:
+        quant_state = (absmax, code)
+    out = dequantize_no_absmax(A, quant_state[1], out)
+    return out * quant_state[0]
+
+
+def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
+    '''
+    Quantizes input tensor to 8-bit.
+
+    Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
+    `out` using the quantization map `code`.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input tensor.
+    code : torch.Tensor
+        The quantization map.
+    out : torch.Tensor, optional
+        The output tensor. Needs to be of type byte.
+
+    Returns
+    -------
+    torch.Tensor:
+        Quantized 8-bit tensor.
+    '''
+    prev_device = pre_call(A.device)
+    if out is None: out = torch.zeros_like(A, dtype=torch.uint8)
+    is_on_gpu([A, out])
+    lib.cquantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()))
+    post_call(prev_device)
+    return out
+
+
+def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
+    '''
+    Dequantizes the 8-bit tensor to 32-bit.
+
+    Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
+    the quantization map `code`.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The 8-bit input tensor.
+    code : torch.Tensor
+        The quantization map.
+    out : torch.Tensor
+        The 32-bit output tensor.
+
+    Returns
+    -------
+    torch.Tensor:
+        32-bit output tensor.
+    '''
+    prev_device = pre_call(A.device)
+    if out is None: out = torch.zeros_like(A, dtype=torch.float32)
+    is_on_gpu([code, A, out])
+    lib.cdequantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()))
+    post_call(prev_device)
+    return out
+
+
+def optimizer_update_32bit(
+    optimizer_name: str,
+    g: Tensor,
+    p: Tensor,
+    state1: Tensor,
+    beta1: float,
+    eps: float,
+    step: int,
+    lr: float,
+    state2: Tensor = None,
+    beta2: float = 0.0,
+    weight_decay: float = 0.0,
+    gnorm_scale: float = 1.0,
+    unorm_vec: Tensor = None,
+    max_unorm: float = 0.0,
+    skip_zeros=False,
+) -> None:
+    """
+    Performs an inplace optimizer update with one or two optimizer states.
+
+    Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.
+
+    Parameters
+    ----------
+    optimizer_name : str
+        The name of the optimizer: {adam}.
+    g : torch.Tensor
+        Gradient tensor.
+    p : torch.Tensor
+        Parameter tensor.
+    state1 : torch.Tensor
+        Optimizer state 1.
+    beta1 : float
+        Optimizer beta1.
+    eps : float
+        Optimizer epsilon.
+    weight_decay : float
+        Weight decay.
+    step : int
+        Current optimizer step.
+    lr : float
+        The learning rate.
+    state2 : torch.Tensor
+        Optimizer state 2.
+    beta2 : float
+        Optimizer beta2.
+    gnorm_scale : float
+        The factor to rescale the gradient to the max clip value.
+    unorm_vec : torch.Tensor
+        The tensor for the update norm.
+    max_unorm : float
+        The maximum update norm relative to the weight norm.
+    skip_zeros : bool
+        Whether to skip zero-valued gradients or not (default: False).
+    """
+
+    param_norm = 0.0
+    if max_unorm > 0.0:
+        param_norm = torch.norm(p.data.float())
+
+
+    optim_func = None
+    if g.dtype == torch.float32:
+        optim_func = str2optimizer32bit[optimizer_name][0]
+    elif g.dtype == torch.float16:
+        optim_func = str2optimizer32bit[optimizer_name][1]
+    elif (g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name])==3):
+        optim_func = str2optimizer32bit[optimizer_name][2]
+    else:
+        raise ValueError(f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}")
+
+    is_on_gpu([g, p, state1, state2, unorm_vec])
+    prev_device = pre_call(g.device)
+    optim_func(
+        get_ptr(g),
+        get_ptr(p),
+        get_ptr(state1),
+        get_ptr(state2),
+        get_ptr(unorm_vec),
+        ct.c_float(max_unorm),
+        ct.c_float(param_norm),
+        ct.c_float(beta1),
+        ct.c_float(beta2),
+        ct.c_float(eps),
+        ct.c_float(weight_decay),
+        ct.c_int32(step),
+        ct.c_float(lr),
+        ct.c_float(gnorm_scale),
+        ct.c_bool(skip_zeros),
+        ct.c_int32(g.numel()))
+    post_call(prev_device)
+
+
+def optimizer_update_8bit(
+    optimizer_name: str,
+    g: Tensor,
+    p: Tensor,
+    state1: Tensor,
+    state2: Tensor,
+    beta1: float,
+    beta2: float,
+    eps: float,
+    step: int,
+    lr: float,
+    qmap1: Tensor,
+    qmap2: Tensor,
+    max1: Tensor,
+    max2: Tensor,
+    new_max1: Tensor,
+    new_max2: Tensor,
+    weight_decay: float = 0.0,
+    gnorm_scale: float = 1.0,
+    unorm_vec: Tensor = None,
+    max_unorm: float = 0.0,
+) -> None:
+    """
+    Performs an inplace Adam update.
+
+    Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights.
+    Uses AdamW formulation if weight decay > 0.0.
+
+    Parameters
+    ----------
+    optimizer_name : str
+        The name of the optimizer. Choices {adam, momentum}
+    g : torch.Tensor
+        Gradient tensor.
+    p : torch.Tensor
+        Parameter tensor.
+    state1 : torch.Tensor
+        Adam state 1.
+    state2 : torch.Tensor
+        Adam state 2.
+    beta1 : float
+        Adam beta1.
+    beta2 : float
+        Adam beta2.
+    eps : float
+        Adam epsilon.
+    weight_decay : float
+        Weight decay.
+    step : int
+        Current optimizer step.
+    lr : float
+        The learning rate.
+    qmap1 : torch.Tensor
+        Quantization map for first Adam state.
+    qmap2 : torch.Tensor
+        Quantization map for second Adam state.
+    max1 : torch.Tensor
+        Max value for first Adam state update.
+    max2 : torch.Tensor
+        Max value for second Adam state update.
+    new_max1 : torch.Tensor
+        Max value for the next Adam update of the first state.
+    new_max2 : torch.Tensor
+        Max value for the next Adam update of the second state.
+    gnorm_scale : float
+        The factor to rescale the gradient to the max clip value.
+    unorm_vec : torch.Tensor
+        The tensor for the update norm.
+    max_unorm : float
+        The maximum update norm relative to the weight norm.
+    """
+
+    param_norm = 0.0
+    if max_unorm > 0.0:
+        param_norm = torch.norm(p.data.float())
+
+    prev_device = pre_call(g.device)
+    is_on_gpu([g, p, state1, state2, unorm_vec, qmap1, qmap2, max1, max2, new_max1, new_max2])
+    if g.dtype == torch.float32 and state1.dtype == torch.uint8:
+        str2optimizer8bit[optimizer_name][0](
+            get_ptr(p),
+            get_ptr(g),
+            get_ptr(state1),
+            get_ptr(state2),
+            get_ptr(unorm_vec),
+            ct.c_float(max_unorm),
+            ct.c_float(param_norm),
+            ct.c_float(beta1),
+            ct.c_float(beta2),
+            ct.c_float(eps),
+            ct.c_int32(step),
+            ct.c_float(lr),
+            get_ptr(qmap1),
+            get_ptr(qmap2),
+            get_ptr(max1),
+            get_ptr(max2),
+            get_ptr(new_max1),
+            get_ptr(new_max2),
+            ct.c_float(weight_decay),
+            ct.c_float(gnorm_scale),
+            ct.c_int32(g.numel()),
+        )
+    elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
+        str2optimizer8bit[optimizer_name][1](
+            get_ptr(p),
+            get_ptr(g),
+            get_ptr(state1),
+            get_ptr(state2),
+            get_ptr(unorm_vec),
+            ct.c_float(max_unorm),
+            ct.c_float(param_norm),
+            ct.c_float(beta1),
+            ct.c_float(beta2),
+            ct.c_float(eps),
+            ct.c_int32(step),
+            ct.c_float(lr),
+            get_ptr(qmap1),
+            get_ptr(qmap2),
+            get_ptr(max1),
+            get_ptr(max2),
+            get_ptr(new_max1),
+            get_ptr(new_max2),
+            ct.c_float(weight_decay),
+            ct.c_float(gnorm_scale),
+            ct.c_int32(g.numel()),
+        )
+    else:
+        raise ValueError(
+            f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}"
+        )
+    post_call(prev_device)
+
+
+def optimizer_update_8bit_blockwise(
+    optimizer_name: str,
+    g: Tensor,
+    p: Tensor,
+    state1: Tensor,
+    state2: Tensor,
+    beta1: float,
+    beta2: float,
+    eps: float,
+    step: int,
+    lr: float,
+    qmap1: Tensor,
+    qmap2: Tensor,
+    absmax1: Tensor,
+    absmax2: Tensor,
+    weight_decay: float = 0.0,
+    gnorm_scale: float = 1.0,
+    skip_zeros=False,
+) -> None:
+
+    optim_func = None
+    prev_device = pre_call(g.device)
+    is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2])
+    if g.dtype == torch.float32 and state1.dtype == torch.uint8:
+        optim_func = str2optimizer8bit_blockwise[optimizer_name][0]
+    elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
+        optim_func = str2optimizer8bit_blockwise[optimizer_name][1]
+    elif (g.dtype == torch.bfloat16 and state1.dtype == torch.uint8 and
+          len(str2optimizer8bit_blockwise[optimizer_name])==3):
+        optim_func = str2optimizer8bit_blockwise[optimizer_name][2]
+    else:
+        raise ValueError(
+            f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}"
+        )
+    post_call(prev_device)
+
+    is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2])
+
+    prev_device = pre_call(g.device)
+    optim_func(
+        get_ptr(p),
+        get_ptr(g),
+        get_ptr(state1),
+        get_ptr(state2),
+        ct.c_float(beta1),
+        ct.c_float(beta2),
+        ct.c_float(eps),
+        ct.c_int32(step),
+        ct.c_float(lr),
+        get_ptr(qmap1),
+        get_ptr(qmap2),
+        get_ptr(absmax1),
+        get_ptr(absmax2),
+        ct.c_float(weight_decay),
+        ct.c_float(gnorm_scale),
+        ct.c_bool(skip_zeros),
+        ct.c_int32(g.numel()),
+    )
+    post_call(prev_device)
+
+def percentile_clipping(
+    grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5
+):
+    """Applies percentile clipping
+
+    grad: torch.Tensor
+        The gradient tensor.
+    gnorm_vec: torch.Tensor
+        Vector of gradient norms. 100 elements expected.
+    step: int
+        The current optimiation steps (number of past gradient norms).
+
+    """
+    prev_device = pre_call(grad.device)
+    is_on_gpu([grad, gnorm_vec])
+    if grad.dtype == torch.float32:
+        lib.cpercentile_clipping_g32(
+            get_ptr(grad),
+            get_ptr(gnorm_vec),
+            ct.c_int32(step),
+            ct.c_int32(grad.numel()),
+        )
+    elif grad.dtype == torch.float16:
+        lib.cpercentile_clipping_g16(
+            get_ptr(grad),
+            get_ptr(gnorm_vec),
+            ct.c_int32(step),
+            ct.c_int32(grad.numel()),
+        )
+    else:
+        raise ValueError(f"Gradient type {grad.dtype} not supported!")
+    post_call(prev_device)
+
+    current_gnorm = torch.sqrt(gnorm_vec[step % 100])
+    vals, idx = torch.sort(gnorm_vec)
+    clip_value = torch.sqrt(vals[percentile])
+    gnorm_scale = 1.0
+
+    if current_gnorm > clip_value:
+        gnorm_scale = clip_value / current_gnorm
+
+    return current_gnorm, clip_value, gnorm_scale
+
+
+def histogram_scatter_add_2d(
+    histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor
+):
+    assert len(histogram.shape) == 2
+    assert histogram.dtype == torch.float32
+    assert source.dtype == torch.float32
+    assert index1.dtype == torch.int32
+    assert index2.dtype == torch.int32
+
+    assert histogram.device.type == "cuda"
+    assert index1.device.type == "cuda"
+    assert index2.device.type == "cuda"
+    assert source.device.type == "cuda"
+
+    maxdim1 = ct.c_int32(histogram.shape[0])
+    n = ct.c_int32(index1.numel())
+    is_on_gpu([histogram, index1, index2, source])
+    lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n)
+
+def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8):
+    if not torch.cuda.is_initialized(): torch.cuda.init()
+    if A.dtype != expected_type or B.dtype != expected_type:
+        raise TypeError(
+            f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}"
+        )
+
+    sA = A.shape
+    sB = B.shape
+    tA = transposed_A
+    tB = transposed_B
+
+    correct = True
+
+    if len(sA) == 2 and len(sB) == 2:
+        if not tA and not tB and A.shape[1] != B.shape[0]:
+            correct = False
+        elif tA and not tB and A.shape[0] != B.shape[0]:
+            correct = False
+        elif tA and tB and A.shape[0] != B.shape[1]:
+            correct = False
+        elif not tA and tB and A.shape[1] != B.shape[1]:
+            correct = False
+    elif len(sA) == 3 and len(sB) == 2:
+        if not tA and not tB and A.shape[2] != B.shape[0]:
+            correct = False
+        elif tA and not tB and A.shape[1] != B.shape[0]:
+            correct = False
+        elif tA and tB and A.shape[1] != B.shape[1]:
+            correct = False
+        elif not tA and tB and A.shape[2] != B.shape[1]:
+            correct = False
+    elif len(sA) == 3 and len(sB) == 3:
+        if not tA and not tB and A.shape[2] != B.shape[1]:
+            correct = False
+        elif tA and not tB and A.shape[1] != B.shape[1]:
+            correct = False
+        elif tA and tB and A.shape[1] != B.shape[2]:
+            correct = False
+        elif not tA and tB and A.shape[2] != B.shape[2]:
+            correct = False
+
+    if out is not None:
+        sout = out.shape
+        # special case common in backprop
+        if not correct and len(sA) == 3 and len(sB) == 3:
+            if (
+                sout[0] == sA[2]
+                and sout[1] == sB[2]
+                and sA[0] == sB[0]
+                and sA[1] == sB[1]
+            ):
+                correct = True
+    else:
+        if len(sA) == 2 and len(sB) == 2:
+            if not tA and not tB:
+                sout = (sA[0], sB[1])
+            elif tA and tB:
+                sout = (sA[1], sB[0])
+            elif tA and not tB:
+                sout = (sA[1], sB[1])
+            elif not tA and tB:
+                sout = (sA[0], sB[0])
+        elif len(sA) == 3 and len(sB) == 2:
+            if not tA and not tB:
+                sout = (sA[0], sA[1], sB[1])
+            elif tA and tB:
+                sout = (sA[0], sA[2], sB[0])
+            elif tA and not tB:
+                sout = (sA[0], sA[2], sB[1])
+            elif not tA and tB:
+                sout = (sA[0], sA[1], sB[0])
+        elif len(sA) == 3 and len(sB) == 3:
+            if not tA and not tB:
+                sout = (sA[0], sA[1], sB[2])
+            elif tA and tB:
+                sout = (sA[0], sA[2], sB[1])
+            elif tA and not tB:
+                sout = (sA[0], sA[2], sB[2])
+            elif not tA and tB:
+                sout = (sA[0], sA[1], sB[1])
+
+    if not correct:
+        raise ValueError(
+            f"Tensor dimensions incorrect for matrix mulitiplication: A x B: {sA} x {sB} with transpose for A x B: {tA} x {tB}."
+        )
+
+    return sout
+
+def cutlass3_gemm(
+    A: Tensor,
+    B: Tensor,
+    out: Tensor = None,
+    transposed_A=False,
+    transposed_B=False,
+    state=None
+):
+    #sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype)
+    if state is None:
+        Bshape = B.shape
+        bout = Bshape[1]
+    else:
+        Bshape = state[1]
+        bout = Bshape[0]
+    if out is None:
+        out = torch.zeros(size=(A.shape[0], bout), dtype=A.dtype, device=A.device)
+
+    sA = A.shape
+    sB = B.shape
+    if transposed_A and len(sA) == 2:
+        sA = (sA[1], sA[0])
+    elif transposed_A and len(sA) == 3:
+        sA = (sA[0], sA[2], sA[0])
+    if transposed_B and len(sB) == 2:
+        sB = (sB[1], sB[0])
+    elif transposed_B and len(sB) == 3:
+        sB = (sB[0], sB[2], sB[0])
+    # this is a mess: cuBLAS expect column major, but PyTorch is row major.
+    # So to perform the matrix multiplication, we have to treat A, B, and C matrices
+    # (transpose of row major is column major)
+    # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these
+
+    # matrices in the input arguments for cuBLAS
+    # column major: A @ B = C: [m, k] @ [k, n] = [m, n]
+    # row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n]
+    # column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m]
+    if len(sB) == 2:
+        if B.stride()[0] == B.shape[1]:
+            transposed_B = False
+        elif B.stride()[1] == B.shape[0]:
+            transposed_B = True
+        if len(A.shape) == 2:
+            if A.stride()[0] == A.shape[1]:
+                transposed_A = False
+            elif A.stride()[1] == A.shape[0]:
+                transposed_A = True
+        else:
+            if A.stride()[1] == A.shape[2]:
+                transposed_A = False
+            elif A.stride()[2] == A.shape[1]:
+                transposed_A = True
+
+        if len(sA) == 2:
+            n = sA[0]
+            ldb = A.stride()[1 if transposed_A else 0]
+        elif len(sA) == 3 and len(sB) == 2:
+            n = sA[0] * sA[1]
+            ldb = sA[2]
+
+        m = sB[1]
+        k = sB[0]
+        lda = B.stride()[0]
+        ldc = sB[1]
+    elif len(sB) == 3:
+        # special case
+        assert len(sA) == 3
+        if not (sA[0] == sB[0] and sA[1] == sB[1]):
+            raise ValueError(
+                f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}"
+            )
+
+        transposed_A = True
+        transposed_B = False
+
+        m = sB[2]
+        n = sA[2]
+        k = sB[0] * sB[1]
+
+        lda = n
+        ldb = sA[2]
+        ldc = m
+
+    ptr = CUBLAS_Context.get_instance().get_context(A.device)
+
+    # B^T @ A^T = C^T
+    # [km, nk -> mn]
+    #lda = ldb = ldc = 1
+    #lda = 1
+    if state is not None:
+        m = Bshape[0]
+        k = Bshape[1]
+        lda = Bshape[0]
+        ldc = Bshape[0]
+        ldb = (ldb+1)//2
+    #print(m, n, k, lda, ldb, ldc)
+    is_on_gpu([B, A, out])
+    m = ct.c_int32(m)
+    n = ct.c_int32(n)
+    k = ct.c_int32(k)
+    lda = ct.c_int32(lda)
+    ldb = ct.c_int32(ldb)
+    ldc = ct.c_int32(ldc)
+
+    if B.dtype == torch.uint8:
+        lib.cgemm_4bit_inference(m, n, k, get_ptr(A), get_ptr(B), get_ptr(state[0]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3]))
+    elif A.dtype == torch.float32:
+        lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc)
+    elif A.dtype == torch.float16:
+        lib.cgemm_host_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc)
+    else:
+        raise NotImplementedError(f'Matmul not implemented for data type {A.dtype}')
+
+    return out
+
+
+
+
+def igemm(
+    A: Tensor,
+    B: Tensor,
+    out: Tensor = None,
+    transposed_A=False,
+    transposed_B=False,
+):
+    sout = check_matmul(A, B, out, transposed_A, transposed_B)
+    if out is None:
+        out = torch.zeros(size=sout, dtype=torch.int32, device=A.device)
+    if len(A.shape) == 3 and len(B.shape) == 3:
+        if A.shape[0] == B.shape[0] and A.shape[2] == B.shape[1]:
+            return batched_igemm(A, B, out)
+
+    sA = A.shape
+    sB = B.shape
+    if transposed_A and len(sA) == 2:
+        sA = (sA[1], sA[0])
+    elif transposed_A and len(sA) == 3:
+        sA = (sA[0], sA[2], sA[0])
+    if transposed_B and len(sB) == 2:
+        sB = (sB[1], sB[0])
+    elif transposed_B and len(sB) == 3:
+        sB = (sB[0], sB[2], sB[0])
+    # this is a mess: cuBLAS expect column major, but PyTorch is row major.
+    # So to perform the matrix multiplication, we have to treat A, B, and C matrices
+    # (transpose of row major is column major)
+    # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these
+
+    # matrices in the input arguments for cuBLAS
+    # column major: A @ B = C: [m, k] @ [k, n] = [m, n]
+    # row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n]
+    # column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m]
+    if len(sB) == 2:
+        if B.stride()[0] == B.shape[1]:
+            transposed_B = False
+        elif B.stride()[1] == B.shape[0]:
+            transposed_B = True
+        if len(A.shape) == 2:
+            if A.stride()[0] == A.shape[1]:
+                transposed_A = False
+            elif A.stride()[1] == A.shape[0]:
+                transposed_A = True
+        else:
+            if A.stride()[1] == A.shape[2]:
+                transposed_A = False
+            elif A.stride()[2] == A.shape[1]:
+                transposed_A = True
+
+        if len(sA) == 2:
+            n = sA[0]
+            ldb = A.stride()[1 if transposed_A else 0]
+        elif len(sA) == 3 and len(sB) == 2:
+            n = sA[0] * sA[1]
+            ldb = sA[2]
+
+        m = sB[1]
+        k = sB[0]
+        lda = B.stride()[(1 if transposed_B else 0)]
+        ldc = sB[1]
+    elif len(sB) == 3:
+        # special case
+        assert len(sA) == 3
+        if not (sA[0] == sB[0] and sA[1] == sB[1]):
+            raise ValueError(
+                f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}"
+            )
+
+        transposed_A = True
+        transposed_B = False
+
+        m = sB[2]
+        n = sA[2]
+        k = sB[0] * sB[1]
+
+        lda = m
+        ldb = sA[2]
+        ldc = m
+
+    ptr = CUBLAS_Context.get_instance().get_context(A.device)
+
+    # B^T @ A^T = C^T
+    # [km, nk -> mn]
+    is_on_gpu([B, A, out])
+    lib.cigemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k),
+               get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc))
+    return out
+
+
+def batched_igemm(
+    A: Tensor,
+    B: Tensor,
+    out: Tensor = None,
+    transposed_A=False,
+    transposed_B=False,
+):
+    if not len(A.shape) == 3 or not len(B.shape) == 3:
+        raise ValueError(
+            f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}"
+        )
+    sout = check_matmul(A, B, out, transposed_A, transposed_B)
+    if out is None:
+        out = torch.zeros(size=sout, dtype=torch.int32, device=A.device)
+
+    if B.is_contiguous():
+        lda = B.stride()[1]
+        transposed_A = False
+    else:
+        s = B.stride()
+        if s[0] != B.shape[0]:
+            B = B.contiguous()
+            lda = B.stride()[1]
+        elif s[2] == B.shape[1]:
+            transposed_A = True
+            lda = B.stride()[2]
+        else:
+            if s[2] == 1:
+                B = B.contiguous()
+                lda = B.stride()[1]
+            elif s[1] == 1:
+                B = B.contiguous()
+                lda = B.stride()[1]
+            else:
+                B = B.contiguous()
+                lda = B.stride()[1]
+
+    if A.is_contiguous():
+        ldb = A.stride()[1]
+        transposed_B = False
+    else:
+        s = A.stride()
+        if s[0] != A.shape[0]:
+            A = A.contiguous()
+            ldb = A.stride()[1]
+            transposed_B = False
+        elif s[2] == A.shape[1]:
+            ldb = A.stride()[2]
+            transposed_B = True
+        else:
+            A = A.contiguous()
+            ldb = A.stride()[1]
+            transposed_B = False
+
+    # this is a mess: cuBLAS expect column major, but PyTorch is row major.
+    # So to perform the matrix multiplication, we have to treat A, B, and C matrices
+    # (transpose of row major is column major)
+    # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these
+    # matrices in the input arguments for cuBLAS
+
+    # column major: A @ B = C: [batch, m, k] @ [batch, k, n] = [batch, m, n]
+    # row major: B^T @ A^T = C^T: [batch, m, k] @ [batch, k, n] = [batch, m, n]
+    # column major with row major layout: B^T @ A^T = C^T: [batch, k, m] @ [batch, n, k] = [batch, n, m]
+    num_batch = A.shape[0]
+    n = A.shape[1]
+    m = B.shape[2]
+    k = B.shape[1]
+
+    ldc = m
+
+    strideA = B.shape[1] * B.shape[2]
+    strideB = A.shape[1] * A.shape[2]
+    strideC = A.shape[1] * B.shape[2]
+
+    ptr = CUBLAS_Context.get_instance().get_context(A.device)
+
+    is_on_gpu([B, A, out])
+    lib.cbatched_igemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k),
+               get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc),
+               ct.c_long(strideA), ct.c_long(strideB), ct.c_long(strideC), ct.c_uint32(num_batch))
+    return out
+
+
+def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+    shapeA = SA[0]
+    shapeB = SB[0]
+    dimsA = len(shapeA)
+    dimsB = len(shapeB)
+    assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
+    if dimsA == 2:
+        m = shapeA[0]
+    elif dimsA == 3:
+        m = shapeA[0] * shapeA[1]
+
+    rows = n = shapeB[0]
+    assert prod(list(shapeA)) > 0, f'Input tensor dimensions need to be > 0: {shapeA}'
+
+    # if the tensor is empty, return a transformed empty tensor with the right dimensions
+    if shapeA[0] == 0 and dimsA == 2:
+        return torch.empty((0, shapeB[0]), device=A.device, dtype=torch.float16)
+    elif shapeA[1] == 0 and dimsA == 3:
+        return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
+
+    if dimsA == 2 and out is None:
+        out, Sout = get_transform_buffer(
+            (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
+        )
+    elif dimsA == 3 and out is None:
+        out, Sout = get_transform_buffer(
+            (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
+        )
+
+    assert dimsB != 3, "len(B.shape)==3 not supported"
+    assert A.device.type == "cuda"
+    assert B.device.type == "cuda"
+    assert A.dtype == torch.int8
+    assert B.dtype == torch.int8
+    assert out.dtype == dtype
+    assert SA[1] == "col32"
+    assert SB[1] in ["col_turing", "col_ampere"]
+    assert Sout[1] == "col32"
+    assert (
+        shapeA[-1] == shapeB[-1]
+    ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
+    formatB = SB[1]
+    prev_device = A.device
+    torch.cuda.set_device(A.device)
+
+    ptr = CUBLAS_Context.get_instance().get_context(A.device)
+    ptrA = get_ptr(A)
+    ptrB = get_ptr(B)
+    ptrC = get_ptr(out)
+
+    k = shapeA[-1]
+    lda = ct.c_int32(m * 32)
+    if formatB == "col_turing":
+        # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
+        # n = rows
+        ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+    else:
+        # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
+        # n = rows
+        ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
+
+    ldc = ct.c_int32(m * 32)
+    m = ct.c_int32(m)
+    n = ct.c_int32(n)
+    k = ct.c_int32(k)
+
+    has_error = 0
+    ptrRowScale = get_ptr(None)
+    is_on_gpu([A, B, out])
+    if formatB == 'col_turing':
+        if dtype == torch.int32:
+            has_error = lib.cigemmlt_turing_32(
+                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+            )
+        else:
+            has_error = lib.cigemmlt_turing_8(
+                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+            )
+    elif formatB == "col_ampere":
+        if dtype == torch.int32:
+            has_error = lib.cigemmlt_ampere_32(
+                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+            )
+        else:
+            has_error = lib.cigemmlt_ampere_8(
+                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+            )
+
+    if has_error == 1:
+        print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}')
+        raise Exception('cublasLt ran into an error!')
+
+    torch.cuda.set_device(prev_device)
+
+    return out, Sout
+
+
+def mm_dequant(
+    A,
+    quant_state,
+    row_stats,
+    col_stats,
+    out=None,
+    new_row_stats=None,
+    new_col_stats=None,
+    bias=None
+):
+    assert A.dtype == torch.int32
+    if bias is not None: assert bias.dtype == torch.float16
+    out_shape = quant_state[0]
+    if len(out_shape) == 3:
+        out_shape = (out_shape[0] * out_shape[1], out_shape[2])
+
+    if out is None:
+        out = torch.empty(out_shape, dtype=torch.float16, device=A.device)
+    if new_row_stats is None:
+        new_row_stats = torch.empty(
+            out_shape[0], dtype=torch.float32, device=A.device
+        )
+    if new_col_stats is None:
+        new_col_stats = torch.empty(
+            out_shape[1], dtype=torch.float32, device=A.device
+        )
+    assert (
+        new_row_stats.shape[0] == row_stats.shape[0]
+    ), f"{new_row_stats.shape} vs {row_stats.shape}"
+    assert (
+        new_col_stats.shape[0] == col_stats.shape[0]
+    ), f"{new_col_stats.shape} vs {col_stats.shape}"
+
+    prev_device = pre_call(A.device)
+    ptrA = get_ptr(A)
+    ptrOut = get_ptr(out)
+    ptrRowStats = get_ptr(row_stats)
+    ptrColStats = get_ptr(col_stats)
+    ptrNewRowStats = get_ptr(new_row_stats)
+    ptrNewColStats = get_ptr(new_col_stats)
+    ptrBias = get_ptr(bias)
+    numRows = ct.c_int32(out_shape[0])
+    numCols = ct.c_int32(out_shape[1])
+
+    is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias])
+    lib.cdequant_mm_int32_fp16(ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols)
+    post_call(prev_device)
+
+    return out
+
+
+def get_colrow_absmax(
+    A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0
+):
+    assert A.dtype == torch.float16
+    device = A.device
+
+    cols = A.shape[-1]
+    if len(A.shape) == 3:
+        rows = A.shape[0] * A.shape[1]
+    else:
+        rows = A.shape[0]
+
+    col_tiles = (cols + 255) // 256
+    tiled_rows = ((rows + 15) // 16) * 16
+    if row_stats is None:
+        row_stats = torch.empty(
+            (rows,), dtype=torch.float32, device=device
+        ).fill_(-50000.0)
+    if col_stats is None:
+        col_stats = torch.empty(
+            (cols,), dtype=torch.float32, device=device
+        ).fill_(-50000.0)
+
+    if nnz_block_ptr is None and threshold > 0.0:
+        nnz_block_ptr = torch.zeros(
+            ((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device
+        )
+
+    ptrA = get_ptr(A)
+    ptrRowStats = get_ptr(row_stats)
+    ptrColStats = get_ptr(col_stats)
+    ptrNnzrows = get_ptr(nnz_block_ptr)
+    rows = ct.c_int32(rows)
+    cols = ct.c_int32(cols)
+
+    prev_device = pre_call(A.device)
+    is_on_gpu([A, row_stats, col_stats, nnz_block_ptr])
+    lib.cget_col_row_stats(ptrA, ptrRowStats, ptrColStats, ptrNnzrows, ct.c_float(threshold), rows, cols)
+    post_call(prev_device)
+
+    if threshold > 0.0:
+        nnz_block_ptr.cumsum_(0)
+
+    return row_stats, col_stats, nnz_block_ptr
+
+
+class COOSparseTensor:
+    def __init__(self, rows, cols, nnz, rowidx, colidx, values):
+        assert rowidx.dtype == torch.int32
+        assert colidx.dtype == torch.int32
+        assert values.dtype == torch.float16
+        assert values.numel() == nnz
+        assert rowidx.numel() == nnz
+        assert colidx.numel() == nnz
+
+        self.rows = rows
+        self.cols = cols
+        self.nnz = nnz
+        self.rowidx = rowidx
+        self.colidx = colidx
+        self.values = values
+
+
+class CSRSparseTensor:
+    def __init__(self, rows, cols, nnz, rowptr, colidx, values):
+        assert rowptr.dtype == torch.int32
+        assert colidx.dtype == torch.int32
+        assert values.dtype == torch.float16
+        assert values.numel() == nnz
+        assert colidx.numel() == nnz
+        assert rowptr.numel() == rows + 1
+
+        self.rows = rows
+        self.cols = cols
+        self.nnz = nnz
+        self.rowptr = rowptr
+        self.colidx = colidx
+        self.values = values
+
+
+class CSCSparseTensor:
+    def __init__(self, rows, cols, nnz, colptr, rowidx, values):
+        assert colptr.dtype == torch.int32
+        assert rowidx.dtype == torch.int32
+        assert values.dtype == torch.float16
+        assert values.numel() == nnz
+        assert rowidx.numel() == nnz
+        assert colptr.numel() == cols + 1
+
+        self.rows = rows
+        self.cols = cols
+        self.nnz = nnz
+        self.colptr = colptr
+        self.rowidx = rowidx
+        self.values = values
+
+
+def coo2csr(cooA):
+    values, counts = torch.unique(cooA.rowidx, return_counts=True)
+    values.add_(1)
+    rowptr = torch.zeros(
+        (cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device
+    )
+    rowptr.scatter_(index=values.long(), src=counts.int(), dim=0)
+    rowptr.cumsum_(0)
+    return CSRSparseTensor(
+        cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values
+    )
+
+
+def coo2csc(cooA):
+    val, col2rowidx = torch.sort(cooA.colidx)
+    rowidx = cooA.rowidx[col2rowidx]
+    values = cooA.values[col2rowidx]
+    colvalues, counts = torch.unique(val, return_counts=True)
+    colvalues.add_(1)
+    colptr = torch.zeros(
+        (cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device
+    )
+    colptr.scatter_(index=colvalues.long(), src=counts.int(), dim=0)
+    colptr.cumsum_(0)
+    return CSCSparseTensor(
+        cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values
+    )
+
+
+def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
+    rowidx = torch.zeros((nnz,), dtype=torch.int32, device=device)
+    colidx = torch.zeros((nnz,), dtype=torch.int32, device=device)
+    values = torch.zeros((nnz,), dtype=dtype, device=device)
+    return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values)
+
+
+def double_quant(
+    A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
+):
+    device = A.device
+    assert A.dtype == torch.half
+    assert device.type == "cuda"
+    prev_device = pre_call(A.device)
+
+    cols = A.shape[-1]
+    if len(A.shape) == 3:
+        rows = A.shape[0] * A.shape[1]
+    else:
+        rows = A.shape[0]
+
+    if row_stats is None or col_stats is None:
+        row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(
+            A, threshold=threshold
+        )
+
+    if out_col is None:
+        out_col = torch.zeros(A.shape, device=device, dtype=torch.int8)
+    if out_row is None:
+        out_row = torch.zeros(A.shape, device=device, dtype=torch.int8)
+
+    coo_tensor = None
+    ptrA = get_ptr(A)
+    ptrColStats = get_ptr(col_stats)
+    ptrRowStats = get_ptr(row_stats)
+    ptrOutCol = get_ptr(out_col)
+    ptrOutRow = get_ptr(out_row)
+
+    is_on_gpu([A, col_stats, row_stats, out_col, out_row])
+    if threshold > 0.0:
+        nnz = nnz_row_ptr[-1].item()
+        if nnz > 0:
+            coo_tensor = coo_zeros(
+                A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device
+            )
+            ptrRowIdx = get_ptr(coo_tensor.rowidx)
+            ptrColIdx = get_ptr(coo_tensor.colidx)
+            ptrVal = get_ptr(coo_tensor.values)
+            ptrRowPtr = get_ptr(nnz_row_ptr)
+
+            lib.cdouble_rowcol_quant(
+                ptrA,
+                ptrRowStats,
+                ptrColStats,
+                ptrOutCol,
+                ptrOutRow,
+                ptrRowIdx,
+                ptrColIdx,
+                ptrVal,
+                ptrRowPtr,
+                ct.c_float(threshold),
+                ct.c_int32(rows),
+                ct.c_int32(cols),
+            )
+            val, idx = torch.sort(coo_tensor.rowidx)
+            coo_tensor.rowidx = val
+            coo_tensor.colidx = coo_tensor.colidx[idx]
+            coo_tensor.values = coo_tensor.values[idx]
+        else:
+            lib.cdouble_rowcol_quant(
+                ptrA,
+                ptrRowStats,
+                ptrColStats,
+                ptrOutCol,
+                ptrOutRow,
+                None,
+                None,
+                None,
+                None,
+                ct.c_float(0.0),
+                ct.c_int32(rows),
+                ct.c_int32(cols),
+            )
+    else:
+        lib.cdouble_rowcol_quant(
+            ptrA,
+            ptrRowStats,
+            ptrColStats,
+            ptrOutCol,
+            ptrOutRow,
+            None,
+            None,
+            None,
+            None,
+            ct.c_float(threshold),
+            ct.c_int32(rows),
+            ct.c_int32(cols),
+        )
+    post_call(prev_device)
+
+    return out_row, out_col, row_stats, col_stats, coo_tensor
+
+
+def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+    prev_device = pre_call(A.device)
+    if state is None: state = (A.shape, from_order)
+    else: from_order = state[1]
+    if out is None: out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
+    else: new_state = (state[0], to_order) # (shape, order)
+
+    shape = state[0]
+    if len(shape) == 2:
+        dim1 = ct.c_int32(shape[0])
+        dim2 = ct.c_int32(shape[1])
+    else:
+        dim1 = ct.c_int32(shape[0] * shape[1])
+        dim2 = ct.c_int32(shape[2])
+
+    is_on_gpu([A, out])
+    if to_order == 'col32':
+        if transpose:
+            lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
+        else:
+            lib.ctransform_row2col32(get_ptr(A), get_ptr(out), dim1, dim2)
+    elif to_order == "col_turing":
+        if transpose:
+            lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2)
+        else:
+            lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2)
+    elif to_order == "col_ampere":
+        if transpose:
+            lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2)
+        else:
+            lib.ctransform_row2ampere(get_ptr(A), get_ptr(out), dim1, dim2)
+    elif to_order == "row":
+        if from_order == "col_turing":
+            lib.ctransform_turing2row(get_ptr(A), get_ptr(out), dim1, dim2)
+        elif from_order == "col_ampere":
+            lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
+    else:
+        raise NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}')
+
+    post_call(prev_device)
+
+    return out, new_state
+
+
+def spmm_coo(cooA, B, out=None):
+    if out is None:
+        out = torch.empty(
+            (cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype
+        )
+    nnz = cooA.nnz
+    assert cooA.rowidx.numel() == nnz
+    assert cooA.colidx.numel() == nnz
+    assert cooA.values.numel() == nnz
+    assert cooA.cols == B.shape[0]
+
+    transposed_B = False if B.is_contiguous() else True
+
+    ldb = B.stride()[(1 if transposed_B else 0)]
+    ldc = B.shape[1]
+
+    ptr = Cusparse_Context.get_instance().context
+
+    ptrRowidx = get_ptr(cooA.rowidx)
+    ptrColidx = get_ptr(cooA.colidx)
+    ptrValues = get_ptr(cooA.values)
+    ptrB = get_ptr(B)
+    ptrC = get_ptr(out)
+    cnnz = ct.c_int32(cooA.nnz)
+    crowsA = ct.c_int32(cooA.rows)
+    ccolsA = ct.c_int32(cooA.cols)
+    ccolsB = ct.c_int32(B.shape[1])
+    cldb = ct.c_int32(ldb)
+    cldc = ct.c_int32(ldc)
+
+    is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out])
+    lib.cspmm_coo(ptr, ptrRowidx, ptrColidx, ptrValues, cnnz, crowsA, ccolsA, ccolsB, cldb, ptrB, cldc, ptrC, ct.c_bool(transposed_B))
+
+    return out
+
+
+def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None):
+    if out is None:
+        out = torch.zeros(
+            (cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype
+        )
+    nnz = cooA.nnz
+    prev_device = pre_call(B.device)
+    assert cooA.rowidx.numel() == nnz
+    assert cooA.colidx.numel() == nnz
+    assert cooA.values.numel() == nnz
+    assert cooA.cols == B.shape[0], f"{cooA.cols} vs {B.shape}"
+
+    transposed_B = False if B.is_contiguous() else True
+
+    ldb = B.stride()[(1 if transposed_B else 0)]
+    ldc = B.shape[1]
+
+    values, counts = torch.unique(cooA.rowidx, return_counts=True)
+    offset = counts.cumsum(0).int()
+    max_count, max_idx = torch.sort(counts, descending=True)
+    max_idx = max_idx.int()
+    max_count = max_count.int()
+    assert (
+        max_count[0] <= 32
+    ), f"Current max count per row is 8 but found {max_count[0]}."
+    assert B.dtype in [torch.float16, torch.int8]
+    ptrOffset = get_ptr(offset)
+    ptrMaxCount = get_ptr(max_count)
+    ptrMaxIdx = get_ptr(max_idx)
+
+    ptrRowidx = get_ptr(cooA.rowidx)
+    ptrColidx = get_ptr(cooA.colidx)
+    ptrValues = get_ptr(cooA.values)
+    ptrB = get_ptr(B)
+    ptrC = get_ptr(out)
+    ptrDequantStats = get_ptr(dequant_stats)
+    cnnz_rows = ct.c_int32(counts.numel())
+    cnnz = ct.c_int32(cooA.nnz)
+    crowsA = ct.c_int32(cooA.rows)
+    ccolsA = ct.c_int32(cooA.cols)
+    crowsB = ct.c_int32(B.shape[1])
+    ccolsB = ct.c_int32(B.shape[1])
+    cldb = ct.c_int32(ldb)
+    cldc = ct.c_int32(ldc)
+
+    is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out, dequant_stats])
+    if B.dtype == torch.float16:
+        lib.cspmm_coo_very_sparse_naive_fp16(
+            ptrMaxCount,
+            ptrMaxIdx,
+            ptrOffset,
+            ptrRowidx,
+            ptrColidx,
+            ptrValues,
+            ptrB,
+            ptrC,
+            ptrDequantStats,
+            cnnz_rows,
+            cnnz,
+            crowsA,
+            crowsB,
+            ccolsB,
+        )
+    elif B.dtype == torch.int8:
+        lib.cspmm_coo_very_sparse_naive_int8(
+            ptrMaxCount,
+            ptrMaxIdx,
+            ptrOffset,
+            ptrRowidx,
+            ptrColidx,
+            ptrValues,
+            ptrB,
+            ptrC,
+            ptrDequantStats,
+            cnnz_rows,
+            cnnz,
+            crowsA,
+            crowsB,
+            ccolsB,
+        )
+    # else: assertion error
+    post_call(prev_device)
+
+    return out
+
+
+C = 127.0
+
+
+def vectorwise_quant(x, dim=1, quant_type="vector"):
+    if quant_type == "linear":
+        max1 = torch.abs(x).max().float()
+        xq = torch.round(x / max1 * 127).to(torch.int8)
+        return xq, max1
+    elif quant_type in ["vector", "row"]:
+        max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True)
+        xq = torch.round(x * (C / max1)).to(torch.int8)
+        return xq, max1
+    elif quant_type == "zeropoint":
+        dtype = x.dtype
+        x = x.float()
+        dyna = x.max() - x.min()
+        if dyna == 0:
+            dyna = 1
+        qx = 255.0 / dyna
+        minx = x.min()
+        zpx = torch.round(minx * qx)
+        x = torch.round(qx * x - zpx) + zpx
+        return x, qx
+    elif quant_type in ["vector-zeropoint", "row-zeropoint"]:
+        dtype = x.dtype
+        x = x.float()
+        dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin(
+            x, dim=dim, keepdim=True
+        )
+        dyna[dyna == 0] = 1
+        qx = 255.0 / dyna
+        minx = torch.amin(x, dim=dim, keepdim=True)
+        zpx = torch.round(minx * qx)
+        x = torch.round(qx * x - zpx) + zpx
+        return x, qx
+    elif quant_type == "truncated-vector":
+        with torch.no_grad():
+            absx = torch.abs(x)
+            max1 = torch.amax(absx, dim=dim, keepdim=True)
+            max1 = max1 * 0.7
+            idx = absx > max1.expand_as(absx)
+            sign = torch.sign(x[idx])
+            x[idx] = max1.expand_as(absx)[idx] * sign
+            xq = torch.round(x / max1 * C).to(torch.int8)
+        return xq, max1
+    else:
+        return None
+
+
+def vectorwise_dequant(xq, max1, quant_type="vector"):
+    if quant_type == "vector":
+        x = (xq / C * max1).to(torch.float32)
+        return x
+    else:
+        return None
+
+
+def vectorwise_mm_dequant(xq, S1, S2, dtype=torch.half, quant_type="vector"):
+    if quant_type == "linear":
+        norm = S1 * S2 / (C * C)
+        # double cast needed to prevent overflows
+        return (xq.float() * norm).to(dtype)
+    elif quant_type == "zeropoint":
+        norm = 1.0 / (S1 * S2)
+        return (xq.float() * norm).to(dtype)
+    elif quant_type == "row-zeropoint":
+        norm = 1.0 / (S1 * S2)
+        x = xq.float()
+        if len(S1.shape) == 3 and len(x.shape) == 2:
+            S1 = S1.squeeze(0)
+        if len(S2.shape) == 3 and len(x.shape) == 2:
+            S2 = S2.squeeze(0)
+        if len(S1.shape) == 2:
+            x *= norm
+        else:
+            x *= norm
+        return x.to(dtype)
+    elif quant_type == "vector-zeropoint":
+        x = xq.float()
+        if len(S1.shape) == 3 and len(x.shape) == 2:
+            S1 = S1.squeeze(0)
+        if len(S2.shape) == 3 and len(x.shape) == 2:
+            S2 = S2.squeeze(0)
+        if len(S1.shape) == 2:
+            x *= 1.0 / S1
+        else:
+            x *= 1.0 / S1
+        x *= 1.0 / S2.t()
+        return x.to(dtype)
+    elif quant_type == "row":
+        x = xq.float()
+        if len(S1.shape) == 3 and len(x.shape) == 2:
+            S1 = S1.squeeze(0)
+        if len(S2.shape) == 3 and len(x.shape) == 2:
+            S2 = S2.squeeze(0)
+        if len(S1.shape) == 2:
+            x *= S1 * S2 / (C * C)
+        else:
+            x *= S1 * S2 / (C * C)
+        return x.to(dtype)
+    elif quant_type in ["truncated-vector", "vector"]:
+        x = xq.float()
+        if len(S1.shape) == 3 and len(x.shape) == 2:
+            S1 = S1.squeeze(0)
+        if len(S2.shape) == 3 and len(x.shape) == 2:
+            S2 = S2.squeeze(0)
+        if len(S1.shape) == 2:
+            x *= S1 / C
+        else:
+            x *= S1 / C
+        x *= S2 / C
+        return x.to(dtype)
+    else:
+        return None
+
+
+def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
+    offset = B.float().t().sum(0) * (SA[0] + SA[1])
+    x = xq.float()
+    if len(xq.shape) == 2 and len(SB.shape) == 3:
+        SB = SB.squeeze(0)
+    if len(SB.shape) == 2:
+        x *= SB.t() / 127
+    else:
+        x *= SB / 127
+    x *= SA[1] / 127
+    x += offset
+    return x.to(dtype)
+
+
+def extract_outliers(A, SA, idx):
+    shapeA = SA[0]
+    formatA = SA[1]
+    assert formatA in ["col_turing", "col_ampere"]
+    assert A.device.type == "cuda"
+
+    out = torch.zeros(
+        (shapeA[0], idx.numel()), dtype=torch.int8, device=A.device
+    )
+
+    idx_size = ct.c_int32(idx.numel())
+    rows = ct.c_int32(shapeA[0])
+    cols = ct.c_int32(shapeA[1])
+    ptrA = get_ptr(A)
+    ptrIdx = get_ptr(idx)
+    ptrOut = get_ptr(out)
+
+    prev_device = pre_call(A.device)
+    if formatA == 'col_turing':
+        lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
+    elif formatA == "col_ampere":
+        lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
+    post_call(prev_device)
+
+    return out
+
+def pipeline_test(A, batch_size):
+    out = torch.zeros_like(A)
+    lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
+    return out
diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index f5739111d..d6329a709 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -387,7 +387,6 @@ def evaluate_cuda_setup():
     if torch.version.hip: 
         return 'libbitsandbytes_hip_nohipblaslt.so', None, None, None, None
 
-    print("WHAT THE FUCK IS THIS")
 
     cuda_setup = CUDASetup.get_instance()
     cudart_path = determine_cuda_runtime_lib_path()
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 226c9e51f..91c1ab5e4 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -18,6 +18,8 @@
 
 from .cextension import COMPILED_WITH_CUDA, lib
 
+# Remark: for AMD GPU we need to disable blocksize == 64
+
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):
@@ -612,7 +614,7 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou
         out = torch.zeros_like(A, dtype=torch.uint8)
 
     if A.device.type != 'cpu':
-        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        assert blocksize in [4096, 2048, 1024, 512, 256, 128]
         cblocksize = ct.c_int32(blocksize)
         prev_device = pre_call(A.device)
         code = code.to(A.device)
@@ -698,8 +700,8 @@ def dequantize_blockwise(
     if A.device.type != 'cpu':
         device = pre_call(A.device)
         code = code.to(A.device)
-        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
-            raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
+        if blocksize not in [2048, 4096, 1024, 512, 256, 128]:
+            raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128]")
         is_on_gpu([A, absmax, out])
         if out.dtype == torch.float32:
             lib.cdequantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()))
@@ -714,13 +716,13 @@ def dequantize_blockwise(
 
     return out
 
-def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
+def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=128, compress_statistics=False):
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4')
 
-def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
+def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=128, compress_statistics=False):
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4')
 
-def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
+def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=128, compress_statistics=False, quant_type='fp4') -> Tensor:
     """
     Quantize tensor A in blocks of 4-bit values.
 
@@ -763,7 +765,7 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
     if out is None:
         out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device)
 
-    assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+    assert blocksize in [4096, 2048, 1024, 512, 256, 128]
 
     prev_device = pre_call(A.device)
     is_on_gpu([A, out, absmax])
@@ -795,13 +797,13 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
 
     return out, state
 
-def dequantize_fp4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
+def dequantize_fp4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 128) -> Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
 
-def dequantize_nf4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
+def dequantize_nf4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 128) -> Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')
 
-def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
+def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 128, quant_type='fp4') -> Tensor:
     """
     Dequantizes FP4 blockwise quantized values.
 
@@ -828,8 +830,8 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax:
     torch.Tensor:
         Dequantized tensor.
     """
-    if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
-        raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
+    if blocksize not in [2048, 4096, 1024, 512, 256, 128]:
+        raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128]")
     if quant_type not in ['fp4', 'nf4']:
         raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
 
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index b10d45ab0..d100bbbc5 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -137,7 +137,8 @@ def forward(self, input: Tensor) -> Tensor:
         return emb
 
 class Params4bit(torch.nn.Parameter):
-    def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64, compress_statistics=True, quant_type='fp4'):
+    # Remark: change blocksize to 128 for AMD gpu
+    def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=128, compress_statistics=True, quant_type='fp4'):
         if data is None:
             data = torch.empty(0)
 

From c3300208eece46606a55756e35b3567370fb3c70 Mon Sep 17 00:00:00 2001
From: Zhaoyi Li <36555117+Lzy17@users.noreply.github.com>
Date: Wed, 12 Jul 2023 10:59:39 -0500
Subject: [PATCH 008/234] Update README.md

---
 README.md | 70 ++++++++++++++++++-------------------------------------
 1 file changed, 23 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index 727a86cb5..1ec8f5ec2 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
-# bitsandbytes
+# bitsandbytes-rocm
 
 The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.
+This fork is the ROCm adaptation of bitsandbytes 0.39.1. The repo is inspired by [agrocylo/bitsandbytes-rocm](https://github.com/agrocylo/bitsandbytes-rocm/tree/main/bitsandbytes), which is a ROCm version of bitsandbytes 0.37. While this fork incorporating the majority of features from bitsandbytes 0.39.1, including the crucial 4 bit quantization feature, certain features such as hipblaslt and hip_bfloat16 have been disabled. Enabling these features is listed as a task for the future.
 
 
 
@@ -11,26 +12,37 @@ Resources:
 
 ## TL;DR
 **Requirements**
-Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + CUDA > 10.0.
+Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + ROCm >= 5.4.2 or CUDA > 10.0
 
-(Deprecated: CUDA 10.0 is deprecated and only CUDA >= 11.0) will be supported with release 0.39.0)
 
 **Installation**:
 
-``pip install bitsandbytes``
 
-In some cases it can happen that you need to compile from source. If this happens please consider submitting a bug report with `python -m bitsandbytes` information. What now follows is some short instructions which might work out of the box if `nvcc` is installed. If these do not work see further below.
+You need to compile from source. 
 
 Compilation quickstart:
 ```bash
-git clone https://github.com/timdettmers/bitsandbytes.git
-cd bitsandbytes
+git clone [https://github.com/timdettmers/bitsandbytes.git](https://github.com/Lzy17/bitsandbytes-rocm)
+cd bitsandbytes-rocm
 
-# CUDA_VERSIONS in {110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 120}
-# make argument in {cuda110, cuda11x, cuda12x}
-# if you do not know what CUDA you have, try looking at the output of: python -m bitsandbytes
-CUDA_VERSION=117 make cuda11x
+make hip
 python setup.py install
+
+#to test if you have successfully installed
+python -m bitsandbytes
+
+#To be benchmarks accuray benchmark from https://github.com/TimDettmers/bitsandbytes/issues/565
+cd benchmarking/accuracy
+python bnb_accuracy.py
+
+#Accurate results should looks like
+#tensor(526.7872, device='cuda:0')
+#tensor(551.2297, device='cuda:0')
+#tensor(574.9075, device='cuda:0')
+#tensor(3435.1819, device='cuda:0')
+#tensor(3480.1541, device='cuda:0')
+
+#
 ```
 
 **Using Int8 inference with HuggingFace Transformers**
@@ -75,23 +87,6 @@ out = linear(x.to(torch.float16))
 - 8-bit quantization: Quantile, Linear, and Dynamic quantization
 - Fast quantile estimation: Up to 100x faster than other algorithms
 
-## Requirements & Installation
-
-Requirements: anaconda, cudatoolkit, pytorch
-
-Hardware requirements:
- - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or older).
- - 8-bit optimizers and quantization: NVIDIA Kepler GPU or newer (>=GTX 78X).
-
-Supported CUDA versions: 10.2 - 12.0
-
-The bitsandbytes library is currently only supported on Linux distributions. Windows is not supported at the moment.
-
-The requirements can best be fulfilled by installing pytorch via anaconda. You can install PyTorch by following the ["Get Started"](https://pytorch.org/get-started/locally/) instructions on the official website.
-
-To install run:
-
-``pip install bitsandbytes``
 
 ## Using bitsandbytes
 
@@ -142,25 +137,6 @@ For upcoming features and changes and full history see [Patch Notes](CHANGELOG.m
 1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available)
 2. __fatbinwrap_.. [Solution](errors_and_solutions.md#fatbinwrap_)
 
-## Compile from source
-To compile from source, you need an installation of CUDA. If `nvcc` is not installed, you can install the CUDA Toolkit with nvcc through the following commands.
-
-```bash
-wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh
-# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-#   CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121}
-#   EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
-
-# For example, the following installs CUDA 11.8 to ~/local/cuda-11.8 and exports the path to your .bashrc
-bash cuda install 118 ~/local 1 
-```
-
-To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`:
-
-``CUDA_HOME=~/local/cuda-11.7 CUDA_VERSION=117 make cuda11x``
-
-For more detailed instruction, please follow the [compile_from_source.md](compile_from_source.md) instructions.
-
 ## License
 
 The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: Pytorch is licensed under the BSD license.

From fcee2d6633e31eda882032573cea8983f4e047e0 Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Wed, 12 Jul 2023 11:20:34 -0500
Subject: [PATCH 009/234] add benchmarks

---
 benchmarking/accuracy/bnb_accuracy.py | 29 +++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 benchmarking/accuracy/bnb_accuracy.py

diff --git a/benchmarking/accuracy/bnb_accuracy.py b/benchmarking/accuracy/bnb_accuracy.py
new file mode 100644
index 000000000..bd3b81db4
--- /dev/null
+++ b/benchmarking/accuracy/bnb_accuracy.py
@@ -0,0 +1,29 @@
+import torch
+import bitsandbytes as bnb
+from bitsandbytes import functional as F
+
+
+
+
+def debug_blocksize(block):
+    x = torch.randn(4096, 4096).cuda()
+    qx, qstate = F.quantize_fp4(x, blocksize=block)
+    dq = F.dequantize_fp4(qx, qstate)
+    return torch.sum(torch.linalg.norm(x - dq, ord="fro"))
+
+def test_blocksize(block):
+    x = torch.randn(10, 10).cuda()
+    qx, qstate = F.quantize_fp4(x, blocksize=block)
+    print(x)
+    print("---------------")
+    print(qx)
+    print("---------------")
+    print(qstate)
+
+    
+
+
+for block in [128, 256, 512, 1024, 2048]:
+    print(debug_blocksize(block))
+
+#test_blocksize(2048)

From 4c0ca08aa24d622940d9abdcff6090efc85dbc30 Mon Sep 17 00:00:00 2001
From: Zhaoyi Li <36555117+Lzy17@users.noreply.github.com>
Date: Tue, 18 Jul 2023 11:11:42 -0500
Subject: [PATCH 010/234] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1ec8f5ec2..ff5246750 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ You need to compile from source.
 
 Compilation quickstart:
 ```bash
-git clone [https://github.com/timdettmers/bitsandbytes.git](https://github.com/Lzy17/bitsandbytes-rocm)
+git clone https://github.com/Lzy17/bitsandbytes-rocm
 cd bitsandbytes-rocm
 
 make hip

From c79861683853b3fda9c36048e6d5b95a504fc0ec Mon Sep 17 00:00:00 2001
From: jpvillam <Juan.Villamizar@amd.com>
Date: Wed, 11 Oct 2023 16:27:37 -0400
Subject: [PATCH 011/234] First draft, getting error

---
 Makefile               |   8 ++--
 csrc/ops.hip           | 101 +++++++++++++++++++++--------------------
 csrc/ops.hiph          |  10 ++--
 csrc/pythonInterface.c |   2 +-
 4 files changed, 62 insertions(+), 59 deletions(-)

diff --git a/Makefile b/Makefile
index 75c27acfb..1e12c8e3a 100644
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ INCLUDE :=  -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/inclu
 LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib
 
 INCLUDE_ROCM := -I $(ROCM_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
-LIB_ROCM := -L $(ROCM_HOME)/lib -lhipblas -lhiprand -lhipsparse -L $(CONDA_PREFIX)/lib
+LIB_ROCM := -L $(ROCM_HOME)/lib -lhipblas -lhipblaslt -lhiprand -lhipsparse -L $(CONDA_PREFIX)/lib
 
 # NVIDIA NVCC compilation flags
 COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
@@ -107,9 +107,9 @@ cuda12x: $(BUILD_DIR) env
 	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
 
 hip: $(BUILD_DIR) env
-	$(HIPCC) -std=c++14 -fPIC -c -DNO_HIPBLASLT $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/ops.hip -o $(BUILD_DIR)/ops.o
-	$(HIPCC) -std=c++14 -fPIC -c -DNO_HIPBLASLT $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/kernels.hip -o $(BUILD_DIR)/kernels.o
-	$(GPP) -std=c++14 -D__HIP_PLATFORM_AMD__ -DBUILD_HIP -DNO_HIPBLASLT -shared -fPIC $(INCLUDE_ROCM) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_hip_nohipblaslt.so $(LIB_ROCM)
+	$(HIPCC) -std=c++14 -fPIC -c $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/ops.hip -o $(BUILD_DIR)/ops.o
+	$(HIPCC) -std=c++14 -fPIC -c $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/kernels.hip -o $(BUILD_DIR)/kernels.o
+	$(GPP) -std=c++14 -D__HIP_PLATFORM_AMD__ -D__HIP_PLATFORM_HCC__ -DBUILD_HIP -shared -fPIC $(INCLUDE_ROCM) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_hip_nohipblaslt.so $(LIB_ROCM)
 
 cpuonly: $(BUILD_DIR) env
 	$(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 9dcdcc012..3a0747b91 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -310,32 +310,35 @@ template<int ORDER> hipblasLtOrder_t get_order()
 	switch(ORDER)
 	{
 		case ROW:
-      return hipblasLt_ORDER_ROW;
+      return HIPBLASLT_ORDER_ROW;
 			break;
     case COL:
-      return hipblasLt_ORDER_COL;
+      return HIPBLASLT_ORDER_COL;
       break;
     case COL32:
-      return hipblasLt_ORDER_COL32;
+      //return HIPBLASLT_ORDER_COL32;
+      return HIPBLASLT_ORDER_COL;
       break;
     case COL_TURING:
-      return hipblasLt_ORDER_COL4_4R2_8C;
-      break;
+      //return HIPBLASLT_ORDER_COL4_4R2_8C;
+      return HIPBLASLT_ORDER_COL;
+     break;
     case COL_AMPERE:
-      return hipblasLt_ORDER_COL32_2R_4R4;
+      //return HIPBLASLT_ORDER_COL32_2R_4R4;
+      return HIPBLASLT_ORDER_COL;
       break;
 		default:
 			break;
   }
 
-	return hipblasLt_ORDER_ROW;
+	return HIPBLASLT_ORDER_ROW;
 }
 
 template hipblasLtOrder_t get_order<ROW>();
 template hipblasLtOrder_t get_order<COL>();
-template hipblasLtOrder_t get_order<COL32>();
-template hipblasLtOrder_t get_order<COL_TURING>();
-template hipblasLtOrder_t get_order<COL_AMPERE>();
+//template hipblasLtOrder_t get_order<COL32>();
+//template hipblasLtOrder_t get_order<COL_TURING>();
+//template hipblasLtOrder_t get_order<COL_AMPERE>();
 #endif
 
 
@@ -380,37 +383,37 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
 
   hipblasLtMatrixLayout_t A_desc = NULL, out_desc = NULL;
   hipblasLtMatrixTransformDesc_t A2Out_desc = NULL;
-  rocblas_operation opTranspose = rocblas_operation_transpose;
+  hipblasOperation_t opTranspose = HIPBLAS_OP_T;
   float transformAlpha = 1.0f, transformBeta = 0.0f;
 
 
   if(DTYPE == 8)
   {
-    checkCublasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLAS_R_8I, dim1, dim2, ldA));
-    checkCublasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLAS_R_8I, dim1, dim2, ldOut));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLASLT_R_8I, dim1, dim2, ldA));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLASLT_R_8I, dim1, dim2, ldOut));
   }
   else if(DTYPE == 32)
   {
-    checkCublasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLAS_R_32I, dim1, dim2, ldA));
-    checkCublasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLAS_R_32I, dim1, dim2, ldOut));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLASLT_R_32I, dim1, dim2, ldA));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLASLT_R_32I, dim1, dim2, ldOut));
   }
   else
   {
     printf("ERROR WRONG TYPE FOR TRANSFORM: %i\n", DTYPE);
   }
 
-  checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(A_desc, hipblasLt_MATRIX_LAYOUT_ORDER, &orderA, sizeof(orderA)));
-  checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(out_desc, hipblasLt_MATRIX_LAYOUT_ORDER, &orderOut, sizeof(orderOut)));
+  checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(A_desc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &orderA, sizeof(orderA)));
+  checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(out_desc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &orderOut, sizeof(orderOut)));
 
-  checkCublasStatus(hipblasLtMatrixTransformDescCreate(&A2Out_desc, HIPBLAS_R_32F));
+  checkHipblasStatus(hipblasLtMatrixTransformDescCreate(&A2Out_desc, HIPBLASLT_R_32F));
 
-  if(transpose){ checkCublasStatus(hipblasLtMatrixTransformDescSetAttribute(A2Out_desc, hipblasLt_MATRIX_TRANSFORM_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); }
+  if(transpose){ checkHipblasStatus(hipblasLtMatrixTransformDescSetAttribute(A2Out_desc, HIPBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); }
 
-  checkCublasStatus(hipblasLtMatrixTransform(ltHandle, A2Out_desc, &transformAlpha, A, A_desc, &transformBeta, NULL, NULL, out, out_desc, 0));
+  checkHipblasStatus(hipblasLtMatrixTransform(ltHandle, A2Out_desc, &transformAlpha, A, A_desc, &transformBeta, NULL, NULL, out, out_desc, 0));
 
-  if (A_desc) checkCublasStatus(hipblasLtMatrixLayoutDestroy(A_desc));
-  if (out_desc) checkCublasStatus(hipblasLtMatrixLayoutDestroy(out_desc));
-  if (A2Out_desc) checkCublasStatus(hipblasLtMatrixTransformDescDestroy(A2Out_desc));
+  if (A_desc) checkHipblasStatus(hipblasLtMatrixLayoutDestroy(A_desc));
+  if (out_desc) checkHipblasStatus(hipblasLtMatrixLayoutDestroy(out_desc));
+  if (A2Out_desc) checkHipblasStatus(hipblasLtMatrixTransformDescDestroy(A2Out_desc));
 }
 
 template void transform<int8_t, ROW, COL, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
@@ -439,53 +442,53 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
     int has_error = 0;
     hipblasLtMatmulDesc_t matmulDesc = NULL;
     hipblasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
-    rocblas_operation opT = rocblas_operation_transpose;
-    hipblasLtPointerMode_t alphaVec = hipblasLt_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
-    hipblasLtOrder_t col32 = hipblasLt_ORDER_COL32;
-    hipblasLtOrder_t col_turing = hipblasLt_ORDER_COL4_4R2_8C;
-    hipblasLtOrder_t col_ampere = hipblasLt_ORDER_COL32_2R_4R4;
+    hipblasOperation_t opT = HIPBLAS_OP_T;
+    //hipblasLtPointerMode_t alphaVec = hipblasLt_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
+    hipblasLtOrder_t col32 = HIPBLASLT_ORDER_COL;
+    hipblasLtOrder_t col_turing = HIPBLASLT_ORDER_COL;
+    hipblasLtOrder_t col_ampere = HIPBLASLT_ORDER_COL;
 
-    has_error |= checkCublasStatus(hipblasLtMatrixLayoutCreate(&Adesc, HIPBLAS_R_8I, m, k, lda));
-    has_error |= checkCublasStatus(hipblasLtMatrixLayoutCreate(&Bdesc, HIPBLAS_R_8I, n, k, ldb));
+    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Adesc, HIPBLASLT_R_8I, m, k, lda));
+    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Bdesc, HIPBLASLT_R_8I, n, k, ldb));
 
-    has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Adesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Adesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
     if(FORMATB == COL_TURING)
-      has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
+      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
     else
-      has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
+      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
 
     if(DTYPE_OUT == 32)
     {
-      has_error |= checkCublasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIPBLAS_R_32I));
-      has_error |= checkCublasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
-      has_error |= checkCublasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLAS_R_32I, m, n, ldc));
-      has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLASLT_COMPUTE_I32, HIPBLASLT_R_32I));
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(int32_t)));
+      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLASLT_R_32I, m, n, ldc));
+      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
       int alpha = 1, beta = 0;
-      has_error |= checkCublasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, NULL, NULL, 0, 0));
+      has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, NULL, NULL, 0, 0));
     }
     else
     {
-      has_error |= checkCublasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIPBLAS_R_32F));
-      has_error |= checkCublasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
-      has_error |= checkCublasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLAS_R_8I, m, n, ldc));
-      has_error |= checkCublasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, hipblasLt_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLASLT_COMPUTE_I32, HIPBLASLT_R_32F));
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLASLT_R_8I, m, n, ldc));
+      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
       if(!SCALE_ROWS)
       {
         float alpha = 1.0f, beta = 0.0f;
-        has_error |= checkCublasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
       }
       else
       {
-        has_error |= checkCublasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
-        has_error |= checkCublasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, NULL, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+        //has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
+        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, NULL, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
       }
     }
 
 
-    if (Cdesc) has_error |= checkCublasStatus(hipblasLtMatrixLayoutDestroy(Cdesc));
-    if (Bdesc) has_error |= checkCublasStatus(hipblasLtMatrixLayoutDestroy(Bdesc));
-    if (Adesc) has_error |= checkCublasStatus(hipblasLtMatrixLayoutDestroy(Adesc));
-    if (matmulDesc) has_error |= checkCublasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
+    if (Cdesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Cdesc));
+    if (Bdesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Bdesc));
+    if (Adesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Adesc));
+    if (matmulDesc) has_error |= checkHipblasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
     if(has_error == 1)
       printf("error detected");
 
diff --git a/csrc/ops.hiph b/csrc/ops.hiph
index dc7bbffa5..2a671509f 100644
--- a/csrc/ops.hiph
+++ b/csrc/ops.hiph
@@ -16,9 +16,9 @@
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_fp16.h>
 #include <rocblas/rocblas.h>
-#ifndef NO_HIPBLASLT
+//#ifndef NO_HIPBLASLT
 #include <hipblaslt/hipblaslt.h>
-#endif
+//#endif
 #include <hipsparse/hipsparse.h>
 #include <vector>
 #include <functional>
@@ -59,8 +59,8 @@ inline void v(hipError_t status) {
     }
 }
 
-inline int checkHiblasStatus(rocblas_status status) {
-    if (status != rocblas_status_success) {
+inline int checkHipblasStatus(hipblasStatus_t status) {
+    if (status != HIPBLAS_STATUS_SUCCESS) {
         printf("hipBLAS API failed with status %d\n", status);
         //throw std::logic_error("cuBLAS API failed");
         return 1;
@@ -129,7 +129,7 @@ class ContextLt
 				ContextLt()
 				{
 					hipblasLtHandle_t handle;
-					rocblasLtCreate(&handle);
+					hipblasLtCreate(&handle);
 					m_handle = handle;
 				}
 };
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index 704d922cb..f84e0e8e5 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -159,7 +159,7 @@ void extractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int row
 
 #ifndef NO_HIPBLASLT
 
-#if BUILD_CUDA
+#if defined(BUILD_CUDA)
  int igemmlt_turing_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
 	{ return igemmlt<COL_TURING, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
 

From 37045e51cf464c8bd015dfe76c7be5db10909451 Mon Sep 17 00:00:00 2001
From: jpvillam-amd <juan.villamizar@amd.com>
Date: Thu, 19 Oct 2023 07:57:45 -0700
Subject: [PATCH 012/234] Small transform fix, still errors on igemm

---
 csrc/ops.hip | 159 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 149 insertions(+), 10 deletions(-)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index 3a0747b91..06ff5a0ae 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -336,7 +336,7 @@ template<int ORDER> hipblasLtOrder_t get_order()
 
 template hipblasLtOrder_t get_order<ROW>();
 template hipblasLtOrder_t get_order<COL>();
-//template hipblasLtOrder_t get_order<COL32>();
+template hipblasLtOrder_t get_order<COL32>();
 //template hipblasLtOrder_t get_order<COL_TURING>();
 //template hipblasLtOrder_t get_order<COL_AMPERE>();
 #endif
@@ -352,7 +352,10 @@ template<int ORDER> int get_leading_dim(int dim1, int dim2)
     case COL:
       return dim1;
       break;
-    case COL32:
+    default:
+      return dim1;
+      break;
+    /*case COL32:
       // 32*row tiles
       return dim1*32;
       break;
@@ -366,6 +369,7 @@ template<int ORDER> int get_leading_dim(int dim1, int dim2)
 		default:
 			return 0;
 			break;
+     */
   }
 }
 
@@ -381,20 +385,22 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
   int ldA = get_leading_dim<SRC>(dim1, dim2);
   int ldOut = get_leading_dim<TARGET>(dim1, dim2);
 
-  hipblasLtMatrixLayout_t A_desc = NULL, out_desc = NULL;
+  hipblasLtMatrixLayout_t A_desc = NULL, out_desc = NULL, B_desc = NULL;
+  T B = T(0);
   hipblasLtMatrixTransformDesc_t A2Out_desc = NULL;
   hipblasOperation_t opTranspose = HIPBLAS_OP_T;
   float transformAlpha = 1.0f, transformBeta = 0.0f;
 
-
   if(DTYPE == 8)
   {
     checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLASLT_R_8I, dim1, dim2, ldA));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&B_desc, HIPBLASLT_R_8I, 0, 0, 0));
     checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLASLT_R_8I, dim1, dim2, ldOut));
   }
   else if(DTYPE == 32)
   {
     checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLASLT_R_32I, dim1, dim2, ldA));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&B_desc, HIPBLASLT_R_32I, 0, 0, 0));
     checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLASLT_R_32I, dim1, dim2, ldOut));
   }
   else
@@ -409,9 +415,10 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
 
   if(transpose){ checkHipblasStatus(hipblasLtMatrixTransformDescSetAttribute(A2Out_desc, HIPBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); }
 
-  checkHipblasStatus(hipblasLtMatrixTransform(ltHandle, A2Out_desc, &transformAlpha, A, A_desc, &transformBeta, NULL, NULL, out, out_desc, 0));
+  checkHipblasStatus(hipblasLtMatrixTransform(ltHandle, A2Out_desc, &transformAlpha, A, A_desc, &transformBeta, A, B_desc, out, out_desc, 0));
 
   if (A_desc) checkHipblasStatus(hipblasLtMatrixLayoutDestroy(A_desc));
+  if (B_desc) checkHipblasStatus(hipblasLtMatrixLayoutDestroy(B_desc));
   if (out_desc) checkHipblasStatus(hipblasLtMatrixLayoutDestroy(out_desc));
   if (A2Out_desc) checkHipblasStatus(hipblasLtMatrixTransformDescDestroy(A2Out_desc));
 }
@@ -425,7 +432,60 @@ template void transform<int8_t, ROW, COL_AMPERE, false, 8>(hipblasLtHandle_t ltH
 template void transform<int8_t, COL32, ROW, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
 template void transform<int32_t, COL32, ROW, false, 32>(hipblasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
 #endif
-
+static std::string hipError_to_string(const hipError_t ret)
+{
+	switch(ret)
+	{
+		case hipSuccess:
+			return "hipSuccess";
+		case hipErrorInvalidContext:
+			return "hipErrorInvalidContext";
+		case hipErrorInvalidKernelFile:
+			return "hipErrorInvalidKernelFile";
+		case hipErrorMemoryAllocation:
+			return "hipErrorMemoryAllocation";
+		case hipErrorInitializationError:
+			return "hipErrorInitializationError";
+		case hipErrorLaunchFailure:
+			return "hipErrorLaunchFailure";
+		case hipErrorLaunchOutOfResources:
+			return "hipErrorLaunchOutOfResources";
+		case hipErrorInvalidDevice:
+			return "hipErrorInvalidDevice";
+		case hipErrorInvalidValue:
+			return "hipErrorInvalidValue";
+		case hipErrorInvalidDevicePointer:
+			return "hipErrorInvalidDevicePointer";
+		case hipErrorInvalidMemcpyDirection:
+			return "hipErrorInvalidMemcpyDirection";
+		case hipErrorUnknown:
+			return "hipErrorUnknown";
+		case hipErrorInvalidResourceHandle:
+			return "hipErrorInvalidResourceHandle";
+		case hipErrorNotReady:
+			return "hipErrorNotReady";
+		case hipErrorNoDevice:
+			return "hipErrorNoDevice";
+		case hipErrorPeerAccessAlreadyEnabled:
+			return "hipErrorPeerAccessAlreadyEnabled";
+		case hipErrorPeerAccessNotEnabled:
+			return "hipErrorPeerAccessNotEnabled";
+		case hipErrorRuntimeMemory:
+			return "hipErrorRuntimeMemory";
+		case hipErrorRuntimeOther:
+			return "hipErrorRuntimeOther";
+		case hipErrorHostMemoryAlreadyRegistered:
+			return "hipErrorHostMemoryAlreadyRegistered";
+		case hipErrorHostMemoryNotRegistered:
+			return "hipErrorHostMemoryNotRegistered";
+		case hipErrorMapBufferObjectFailed:
+			return "hipErrorMapBufferObjectFailed";
+		case hipErrorTbd:
+			return "hipErrorTbd";
+		default:
+			throw std::runtime_error("unknown hipError");
+	}
+}
 #ifndef NO_HIPBLASLT
 template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
 {
@@ -450,21 +510,69 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
 
     has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Adesc, HIPBLASLT_R_8I, m, k, lda));
     has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Bdesc, HIPBLASLT_R_8I, n, k, ldb));
-
     has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Adesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+
+
     if(FORMATB == COL_TURING)
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
     else
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
 
+    //Set User Preference attributes
+    int64_t max_workspace_size = 32 * 1024 * 1024 * 4;
+    void*    d_workspace;
+    //NEED HIP CHECK ERROR
+    //hipMalloc(&d_workspace, max_workspace_size);
+
     if(DTYPE_OUT == 32)
     {
       has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLASLT_COMPUTE_I32, HIPBLASLT_R_32I));
+      auto opA = HIPBLAS_OP_N;
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(int32_t)));
       has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(int32_t)));
+      hipblasLtEpilogue_t epilogue = HIPBLASLT_EPILOGUE_DEFAULT;
+      checkHipblasStatus(hipblasLtMatmulDescSetAttribute(
+        matmulDesc, HIPBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)));
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLASLT_R_32I, m, n, ldc));
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
       int alpha = 1, beta = 0;
-      has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, NULL, NULL, 0, 0));
+
+
+      /* Algo and workspace TODO: need to rework to not be duplicated */
+      // Set User Preference attributes
+      hipblasLtMatmulPreference_t pref;
+      checkHipblasStatus(hipblasLtMatmulPreferenceCreate(&pref));
+      checkHipblasStatus(
+          hipblasLtMatmulPreferenceSetAttribute(pref,
+                                                HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                                                &max_workspace_size,
+                                                sizeof(max_workspace_size)));
+
+      const int                        request_solutions = 1;
+      hipblasLtMatmulHeuristicResult_t heuristicResult[request_solutions];
+      int                              returnedAlgoCount = 0;
+      checkHipblasStatus(hipblasLtMatmulAlgoGetHeuristic(ltHandle,
+                                                            matmulDesc,
+                                                            Adesc,
+                                                            Bdesc,
+                                                            Cdesc,
+                                                            Cdesc,
+                                                            pref,
+                                                            request_solutions,
+                                                            heuristicResult,
+                                                            &returnedAlgoCount));
+
+      auto toMalloc = max(heuristicResult[0].workspaceSize, max_workspace_size);
+
+      //printf("\n\n1Got algosn: %d %d %d\n\n",returnedAlgoCount, heuristicResult[0].workspaceSize, toMalloc);
+      //NEED HIP CHECK ERROR
+      auto err = hipMalloc(&d_workspace, toMalloc);
+      //printf("Hipmalloc\n");
+      //printf(hipError_to_string(err).c_str());
+      //printf("\n");
+      has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, &heuristicResult[0].algo, d_workspace, toMalloc, 0));
+//hipStreamSynchronize(0);
+      hipFree(d_workspace);
     }
     else
     {
@@ -472,16 +580,47 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
       has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLASLT_R_8I, m, n, ldc));
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+      /* Algo and workspace TODO: need to rework to not be duplicated */
+      // Set User Preference attributes
+      hipblasLtMatmulPreference_t pref;
+      checkHipblasStatus(hipblasLtMatmulPreferenceCreate(&pref));
+      checkHipblasStatus(
+          hipblasLtMatmulPreferenceSetAttribute(pref,
+                                                HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                                                &max_workspace_size,
+                                                sizeof(max_workspace_size)));
+
+      const int                        request_solutions = 1;
+      hipblasLtMatmulHeuristicResult_t heuristicResult[request_solutions];
+      int                              returnedAlgoCount = 0;
+      checkHipblasStatus(hipblasLtMatmulAlgoGetHeuristic(ltHandle,
+                                                            matmulDesc,
+                                                            Adesc,
+                                                            Bdesc,
+                                                            Cdesc,
+                                                            Cdesc,
+                                                            pref,
+                                                            request_solutions,
+                                                            heuristicResult,
+                                                            &returnedAlgoCount));
+
+      //NEED HIP CHECK ERROR
+      hipMalloc(&d_workspace, heuristicResult[0].workspaceSize);
       if(!SCALE_ROWS)
       {
         float alpha = 1.0f, beta = 0.0f;
-        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+
+        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, d_workspace, max_workspace_size, 0));
       }
       else
       {
         //has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
-        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, NULL, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+	float beta = 0.0f;
+
+        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, d_workspace, max_workspace_size, 0));
       }
+
+      hipFree(d_workspace);
     }
 
 

From 314d5e0f3d3bce0629aa5aae0b0cfdd17c170c28 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Fri, 10 Nov 2023 02:14:47 -0800
Subject: [PATCH 013/234] init device abstraction

---
 bitsandbytes/autograd/_functions.py | 26 +++++---
 bitsandbytes/cextension.py          | 20 +++---
 bitsandbytes/functional.py          | 94 +++++++++++++++++++++++++++--
 bitsandbytes/nn/modules.py          | 75 ++++++++++++++++++++---
 4 files changed, 179 insertions(+), 36 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index f8403cf24..7f2920bfb 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -223,6 +223,9 @@ def backward(ctx, grad_output):
 
 
 def supports_igemmlt(device: torch.device) -> bool:
+    if device is "cpu":
+        return True
+
     """check if this device supports the optimized int8 kernel"""
     if torch.cuda.get_device_capability(device=device) < (7, 5):
         return False
@@ -267,6 +270,7 @@ class MatmulLtState:
     idx = None
     is_training = True
     has_fp16_weights = True
+
     memory_efficient_backward = False
     use_pool = False
     formatB = F.get_special_format_str()
@@ -294,7 +298,8 @@ class MatMul8bitLt(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
-        using_igemmlt = supports_igemmlt(A.device) and not state.force_no_igemmlt
+        device = A.device
+        using_igemmlt = supports_igemmlt(device) and not state.force_no_igemmlt
         # default of pytorch behavior if inputs are empty
         ctx.is_empty = False
         if prod(A.shape) == 0:
@@ -303,9 +308,9 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
             ctx.B = B
             ctx.bias = bias
             if A.shape[-1] == B.shape[0]:
-                return torch.empty(A.shape[:-1] + B.shape[1:], dtype=A.dtype, device=A.device)
+                return torch.empty(A.shape[:-1] + B.shape[1:], dtype=A.dtype, device=device)
             else:
-                return torch.empty(A.shape[:-1] + B.shape[:1], dtype=A.dtype, device=A.device)
+                return torch.empty(A.shape[:-1] + B.shape[:1], dtype=A.dtype, device=device)
 
         # 1. Quantize A
         # 2. Quantize B
@@ -318,13 +323,14 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
             state.outlier_pool = GlobalOutlierPooler.get_instance()
 
         # Cast A to fp16
-        if A.dtype != torch.float16:
+        ctx.cast_dtype = torch.bfloat16 if device is "cpu" else torch.float16
+        if A.dtype != ctx.cast_dtype:
             warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
 
         # 1. Quantize A
         if len(A.shape) == 3:
             A = A.reshape(-1, A.shape[-1])
-        CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(torch.float16), threshold=state.threshold)
+        CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(ctx.cast_dtype), threshold=state.threshold)
 
         if state.threshold > 0.0 and coo_tensorA is not None:
             if state.has_fp16_weights:
@@ -337,7 +343,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
             else:
                 if state.CxB is None and using_igemmlt:
                     # B in in 8-bit row-major, we can transform it back to 16-bit to extract outlier dimensions
-                    # we also need to convert it to the turing/ampere format
+                    # we also need to convert it to the turing/ampere format if using cuda
                     state.CxB, state.SB = F.transform(state.CB, to_order=formatB)
         else:
             if not state.has_fp16_weights and state.CxB is None and using_igemmlt:
@@ -359,7 +365,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
                     state.SCB,
                     state.SCBt,
                     coo_tensorB,
-                ) = F.double_quant(B.to(torch.float16))
+                ) = F.double_quant(B.to(ctx.cast_dtype))
                 if using_igemmlt:
                     state.CxB, state.SB = F.transform(CB, to_order=formatB)
                 else:
@@ -399,7 +405,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
         if using_igemmlt:
             C32A, SA = F.transform(CA, "col32")
             out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
-            if bias is None or bias.dtype == torch.float16:
+            if bias is None or bias.dtype in [torch.float16, torch.bfloat16]:
                 # we apply the fused bias here
                 output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias)
                 output = output.to(A.dtype)
@@ -458,7 +464,7 @@ def backward(ctx, grad_output):
         if len(grad_output.shape) == 3:
             grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
 
-        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
+        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(ctx.cast_dtype))
         if req_gradB:
             CxAt, SAt = F.transform(CAt, formatB, transpose=True)
             C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)
@@ -565,7 +571,7 @@ def matmul(
 
 def matmul_4bit(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None):
     assert quant_state is not None
-    if A.numel() == A.shape[-1] and A.requires_grad == False:
+    if A.numel() == A.shape[-1] and A.requires_grad == False and A.device is "cuda":
         absmax, shape, dtype, blocksize, compressed_stats, quant_type, data_type = quant_state
         if A.shape[-1] % blocksize != 0:
             warn(f'Some matrices hidden dimension is not a multiple of {blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index d52a6d607..42fe44387 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -7,13 +7,13 @@
 
 from bitsandbytes.cuda_setup.main import CUDASetup
 
+if torch.cuda.is_available():
+    setup = CUDASetup.get_instance()
+    if setup.initialized != True:
+        setup.run_cuda_setup()
 
-setup = CUDASetup.get_instance()
-if setup.initialized != True:
-    setup.run_cuda_setup()
-
-lib = setup.lib
-try:
+    lib = setup.lib
+ 
     if lib is None and torch.cuda.is_available():
         CUDASetup.get_instance().generate_instructions()
         CUDASetup.get_instance().print_log_stack()
@@ -30,12 +30,10 @@
     lib.get_cusparse.restype = ct.c_void_p
     lib.cget_managed_ptr.restype = ct.c_void_p
     COMPILED_WITH_CUDA = True
-except AttributeError as ex:
-    warn("The installed version of bitsandbytes was compiled without GPU support. "
-        "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.")
-    COMPILED_WITH_CUDA = False
-    print(str(ex))
 
+else:
+    warn("The installed version of bitsandbytes was compiled without GPU support. Will"
+        "run with CPU support")
 
 # print the setup details after checking for errors so we do not print twice
 #if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 96f8ce4e6..b223f0703 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -772,7 +772,7 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize
 def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4')
 
-def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
+def cuda_quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
     """
     Quantize tensor A in blocks of 4-bit values.
 
@@ -1699,7 +1699,7 @@ def batched_igemm(
     return out
 
 
-def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+def cuda_igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     shapeA = SA[0]
     shapeB = SB[0]
     dimsA = len(shapeA)
@@ -1796,7 +1796,7 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     return out, Sout
 
 
-def mm_dequant(
+def cuda_mm_dequant(
     A,
     quant_state,
     row_stats,
@@ -1980,7 +1980,7 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
     return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values)
 
 
-def double_quant(
+def cuda_double_quant(
     A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
 ):
     device = A.device
@@ -2076,7 +2076,7 @@ def double_quant(
     return out_row, out_col, row_stats, col_stats, coo_tensor
 
 
-def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+def cuda_transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
     prev_device = pre_call(A.device)
     if state is None: state = (A.shape, from_order)
     else: from_order = state[1]
@@ -2372,7 +2372,7 @@ def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
     return x.to(dtype)
 
 
-def extract_outliers(A, SA, idx):
+def cuda_extract_outliers(A, SA, idx):
     shapeA = SA[0]
     formatA = SA[1]
     assert formatA in ["col_turing", "col_ampere"]
@@ -2402,3 +2402,85 @@ def pipeline_test(A, batch_size):
     out = torch.zeros_like(A)
     lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
     return out
+
+
+# 8 bits
+def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
+        if A.device is "cuda":
+            return cuda_double_quant(A=A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
+        elif A.device is "cpu":
+            pass
+        elif A.device is "xpu":
+            pass
+        else:
+            pass
+
+def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+        if A.device is "cuda":
+            cuda_transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
+        elif A.device is "cpu":
+            pass
+        elif A.device is "xpu":
+            pass
+        else:
+            pass
+
+def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+        if A.device is "cuda":
+            cuda_igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
+        elif A.device is "cpu":
+            pass
+        elif A.device is "xpu":
+            pass
+        else:
+            pass
+
+def mm_dequant(
+    A,
+    quant_state,
+    row_stats,
+    col_stats,
+    out=None,
+    new_row_stats=None,
+    new_col_stats=None,
+    bias=None
+):
+        if A.device is "cuda":
+            cuda_mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
+        elif A.device is "cpu":
+            pass
+        elif A.device is "xpu":
+            pass
+        else:
+            pass
+
+# 4 bits
+def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
+        if A.device is "cuda":
+            cuda_quantize_4bit(A, absmax = absmax, out = out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
+        elif A.device is "cpu":
+            pass
+        elif A.device is "xpu":
+            pass
+        else:
+            pass
+
+def dequantize_4bit(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
+        if A.device is "cuda":
+            cuda_dequantize_4bit(A, quant_state = quant_state, absmax = absmax, out = out, blocksize = blocksize, quant_type=quant_type)
+        elif A.device is "cpu":
+            pass
+        elif A.device is "xpu":
+            pass
+        else:
+            pass
+
+def extract_outliers(A, SA, idx):
+        if A.device is "cuda":
+            cuda_extract_outliers(A, SA, idx)
+        elif A.device is "cpu":
+            pass
+        elif A.device is "xpu":
+            pass
+        else:
+            pass
\ No newline at end of file
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 3d34bb45f..d024ddd9e 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -152,6 +152,14 @@ def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64,
         self.data = data
         return self
 
+    def cpu(self, device):
+        w = self.data.contiguous().half()
+        w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
+        self.data = w_4bit
+        self.quant_state = quant_state
+
+        return self
+
     def cuda(self, device):
         w = self.data.contiguous().half().cuda(device)
         w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
@@ -160,6 +168,14 @@ def cuda(self, device):
 
         return self
 
+    def xpu(self, device):
+        w = self.data.contiguous().half().to("xpu")
+        w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
+        self.data = w_4bit
+        self.quant_state = quant_state
+
+        return self
+
     @overload
     def to(self: T, device: Optional[Union[int, device]] = ..., dtype: Optional[Union[dtype, str]] = ..., non_blocking: bool = ...,) -> T:
         ...
@@ -174,9 +190,15 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T:
 
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
-
-        if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"):
-            return self.cuda(device)
+    
+        if device is not None and device.type == "cpu":
+            return self.cpu(device)
+
+        if (device is not None and device.type != "cpu" and self.data.device.type == "cpu"):
+            if device.type == "cuda":
+                return self.cuda(device)
+            elif device.type == "xpu":
+                return self.xpu(device)
         else:
             s = self.quant_state
             if s is not None:
@@ -287,6 +309,39 @@ def __new__(
             data = torch.empty(0)
         return torch.Tensor._make_subclass(cls, data, requires_grad)
 
+
+    def cpu(self, device):
+        if self.has_fp16_weights:
+            return super()
+        else:
+            # we store the 8-bit rows-major weight
+            # we convert this weight to the turning/ampere weight during the first inference pass
+            B = self.data.contiguous().half()
+            CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
+            del CBt
+            del SCBt
+            self.data = CB
+            setattr(self, "CB", CB)
+            setattr(self, "SCB", SCB)
+
+        return self
+
+    def cpu(self, device):
+        if self.has_fp16_weights:
+            return super().to("xpu")
+        else:
+            # we store the 8-bit rows-major weight
+            # we convert this weight to the turning/ampere weight during the first inference pass
+            B = self.data.contiguous().half().to("xpu")
+            CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
+            del CBt
+            del SCBt
+            self.data = CB
+            setattr(self, "CB", CB)
+            setattr(self, "SCB", SCB)
+
+        return self
+
     def cuda(self, device):
         if self.has_fp16_weights:
             return super().cuda(device)
@@ -325,12 +380,14 @@ def to(self, *args, **kwargs):
             *args, **kwargs
         )
 
-        if (
-            device is not None
-            and device.type == "cuda"
-            and self.data.device.type == "cpu"
-        ):
-            return self.cuda(device)
+        if device is not None and device.type == "cpu":
+            return self.cpu(device)
+
+        if (device is not None and device.type != "cpu" and self.data.device.type == "cpu"):
+            if device.type == "cuda":
+                return self.cuda(device)
+            elif device.type == "xpu":
+                return self.xpu(device)
         else:
             new_param = Int8Params(
                 super().to(

From 524fa5739f7d5e77475112c1e8b74d4d4634a18a Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Wed, 15 Nov 2023 23:05:58 +0000
Subject: [PATCH 014/234] create HIP_ENVIRONMENT variable

---
 bitsandbytes/cextension.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 35c0386b9..c9cbe616f 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -32,6 +32,7 @@
     if torch.version.cuda:
         lib.get_cusparse.restype = ct.c_void_p
     elif torch.version.hip:
+        HIP_ENVIRONMENT = True
         lib.get_hipsparse.restype = ct.c_void_p
 
     lib.cget_managed_ptr.restype = ct.c_void_p

From d7f7a8291aad325e165e3bfc3e3d4ce83453299e Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Wed, 15 Nov 2023 23:07:46 +0000
Subject: [PATCH 015/234] Skip failing tests on rocm

---
 tests/test_autograd.py             |  4 +++-
 tests/test_cuda_setup_evaluator.py |  3 ++-
 tests/test_functional.py           | 30 ++++++++++++++++++------------
 tests/test_linear8bitlt.py         |  3 ++-
 tests/test_modules.py              |  5 ++++-
 tests/test_optim.py                |  3 +++
 6 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 803fde145..787005823 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -4,6 +4,7 @@
 import torch
 
 import bitsandbytes as bnb
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 
 n = 1
 k = 25
@@ -288,7 +289,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
 )
 names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize(
     "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias",
     values,
@@ -552,6 +553,7 @@ def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose,
 str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose))
 names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(*vals) for vals in str_values]
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", values, ids=names)
 def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
     dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py
index 4973da50d..0918c885e 100644
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -9,8 +9,9 @@
     evaluate_cuda_setup,
     extract_candidate_paths,
 )
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_cuda_full_system():
     ## this only tests the cuda version and not compute capability
 
diff --git a/tests/test_functional.py b/tests/test_functional.py
index cc58324e4..44a4e662a 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -10,6 +10,7 @@
 
 import bitsandbytes as bnb
 from bitsandbytes import functional as F
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 from scipy.stats import norm
 
 torch.set_printoptions(
@@ -90,7 +91,7 @@ def setup():
 def teardown():
     pass
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize(
     "dtype", [torch.float32, torch.float16], ids=["float", "half"]
 )
@@ -110,7 +111,7 @@ def test_estimate_quantiles(dtype):
     diff = torch.abs(code - quantiles)
     assert (diff > 5e-02).sum().item() == 0
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_quantile_quantization():
     for i in range(100):
         A1 = torch.randn(1024, 1024, device="cuda")
@@ -153,7 +154,7 @@ def test_dynamic_quantization():
         assert diff < 0.004
 
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("nested", [False, True], ids=["False", "True"])
 @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
 def test_dynamic_blockwise_quantization(nested, blocksize):
@@ -601,6 +602,7 @@ def test_vector_quant(dim1, dim2, dim3):
 names = ["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values]
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names)
 def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
     if dims == 3 and out_order != "col32":
@@ -684,7 +686,7 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
     for vals in values
 ]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims, ldb", values, ids=names)
 def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
     for i in range(k):
@@ -732,7 +734,7 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
     for vals in values
 ]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims", values, ids=names)
 def test_igemmlt_half(dim1, dim2, dim3, dim4, dims):
     formatB = F.get_special_format_str()
@@ -956,7 +958,7 @@ def test_bench_8bit_training(batch, seq, model, hidden):
 values = list(product(dim1, dim4, dims, formatB, has_bias))
 names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names)
 def test_dequant_mm(dim1, dim4, dims, formatB, has_bias):
     inner = torch.randint(1, 128, size=(1,)).item()
@@ -1109,7 +1111,7 @@ def test_double_quant(dim1, dim2):
 values = list(zip(dim1, dim4, inner))
 names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
 def test_integrated_igemmlt(dim1, dim4, inner):
     for i in range(k):
@@ -1298,7 +1300,7 @@ def test_row_scale_bench(dim1, dim4, inner):
     for vals in values
 ]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize(
     "dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",
     values,
@@ -1347,7 +1349,7 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
     for vals in values
 ]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_overflow():
     formatB = F.get_special_format_str()
     print(formatB)
@@ -1408,7 +1410,7 @@ def test_coo_double_quant(dim1, dim2):
 values = list(product(dim1, dim2, transposed_B))
 names = ["dim1_{}_dim2_{}_transposed_B_{}".format(*vals) for vals in values]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names)
 def test_spmm_coo(dim1, dim2, transposed_B):
     threshold = 1.5
@@ -1440,6 +1442,7 @@ def test_spmm_coo(dim1, dim2, transposed_B):
         assert_all_approx_close(out1, out2, rtol=0.01, atol=3.0e-2, count=30)
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_spmm_bench():
     batch = 2
     model = 1024 * 1
@@ -1489,7 +1492,7 @@ def test_spmm_bench():
 values = list(product(dim1, dim2))
 names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2", values, ids=names)
 def test_integrated_sparse_decomp(dim1, dim2):
     threshold = 3.0
@@ -1672,6 +1675,7 @@ def test_coo2csc():
 names = ["dim1_{}_dim2_{}_dtype_{}".format(*vals) for vals in values]
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names)
 def test_spmm_coo_dequant(dim1, dim2, dtype):
     threshold = 6.0
@@ -2178,6 +2182,7 @@ def test_few_bit_quant():
     #assert False
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_kbit_quantile_estimation():
     for i in range(100):
         data = torch.randn(1024, 1024, device='cuda')
@@ -2220,7 +2225,7 @@ def test_bench_dequantization():
     #print((time.time()-t0)/1e6)
 
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_fp4_quant():
     vals = list(product([0, 1], repeat=4))
 
@@ -2258,6 +2263,7 @@ def test_fp4_quant():
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("quant_type", ['fp4', 'nf4'])
 def test_4bit_compressed_stats(quant_type):
     for blocksize in [128, 64]:
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 37f7af9cb..62f863554 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -10,7 +10,7 @@
 from bitsandbytes import functional as F
 from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout
 from bitsandbytes.nn.modules import Linear8bitLt
-
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 
 # contributed by Alex Borzunov, see:
 # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py
@@ -69,6 +69,7 @@ def test_linear_no_igemmlt():
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt",
                          list(product([False, True], [False, True], [False, True], [False, True])))
 def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt):
diff --git a/tests/test_modules.py b/tests/test_modules.py
index d0a905197..f7d8f5e3f 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -5,7 +5,7 @@
 from torch import nn
 
 import bitsandbytes as bnb
-
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 
 class MockArgs:
     def __init__(self, initial_data):
@@ -315,6 +315,7 @@ def forward(self, x):
 names = [f"threshold_{vals}" for vals in values]
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("threshold", values, ids=names)
 def test_linear8bitlt_inference(threshold):
     l1 = bnb.nn.Linear8bitLt(32, 64, threshold=threshold).cuda().half()
@@ -329,6 +330,7 @@ def test_linear8bitlt_inference(threshold):
             assert l1.state.CxB is not None
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_linear8bitlt_accumulated_gradient():
     l1 = torch.nn.Sequential(*[bnb.nn.Linear8bitLt(32, 32).cuda().half() for i in range(2)])
     l2 = torch.nn.Sequential(*[torch.nn.Linear(32, 32).cuda().half() for i in range(2)])
@@ -518,6 +520,7 @@ def test_linear_kbit_fp32_bias(module):
 modules.append(lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True))
 names = ['Int8Lt', '4bit', 'FP4', 'NF4', 'FP4+C', 'NF4+C']
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("module", modules, ids=names)
 def test_kbit_backprop(module):
     b = 17
diff --git a/tests/test_optim.py b/tests/test_optim.py
index 9e90083a9..a785c1ccb 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -13,6 +13,7 @@
 
 import bitsandbytes as bnb
 import bitsandbytes.functional as F
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 
 # import apex
 
@@ -109,6 +110,7 @@ def rm_path(path):
 optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion']
 values = list(product(dim1, dim2, gtype, optimizer_names))
 names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values]
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
 def test_optimizer32bit(dim1, dim2, gtype, optim_name):
     if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: pytest.skip()
@@ -251,6 +253,7 @@ def test_global_config(dim1, dim2, gtype):
 ]
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
 def test_optimizer8bit(dim1, dim2, gtype, optim_name):
     if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip()

From 28b80564b7efbec22eba9800ac4b83ee745a34c1 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Mon, 20 Nov 2023 18:13:42 +0000
Subject: [PATCH 016/234] Add default value for HIP_ENVIRONMENT

---
 bitsandbytes/cextension.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index c9cbe616f..583237bb6 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -29,6 +29,7 @@
     lib.cadam32bit_grad_fp32 # runs on an error if the library could not be found -> COMPILED_WITH_CUDA=False
     lib.get_context.restype = ct.c_void_p
     
+    HIP_ENVIRONMENT = False
     if torch.version.cuda:
         lib.get_cusparse.restype = ct.c_void_p
     elif torch.version.hip:

From 38c934ed151b3f1804e5437e9d4861995a1dde4d Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Tue, 21 Nov 2023 20:54:17 +0000
Subject: [PATCH 017/234] skip failing triton tests on rocm

---
 tests/test_triton.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_triton.py b/tests/test_triton.py
index e18c7a930..8890193fc 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -4,7 +4,9 @@
 from bitsandbytes.triton.triton_utils import is_triton_available
 from bitsandbytes.nn.triton_based_modules import SwitchBackLinear
 from bitsandbytes.nn import Linear8bitLt
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.skipif(not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8,
                     reason="This test requires triton and a GPU with compute capability 8.0 or higher.")
 @pytest.mark.parametrize("vector_wise_quantization", [False, True])

From 2e9550aa0dcfa6f68dc345e2052f68899e6c8d20 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Tue, 28 Nov 2023 00:37:12 -0800
Subject: [PATCH 018/234] refinement

---
 bitsandbytes/autograd/_functions.py |  6 ++--
 bitsandbytes/functional.py          | 25 ---------------
 bitsandbytes/nn/modules.py          | 48 +++++++----------------------
 3 files changed, 14 insertions(+), 65 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 7f2920bfb..21b814bf5 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -322,10 +322,10 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
         if state.outlier_pool is None:
             state.outlier_pool = GlobalOutlierPooler.get_instance()
 
-        # Cast A to fp16
+        # Cast A to fp16 if not on CPU
         ctx.cast_dtype = torch.bfloat16 if device is "cpu" else torch.float16
         if A.dtype != ctx.cast_dtype:
-            warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
+            warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to {ctx.cast_dtype} during quantization")
 
         # 1. Quantize A
         if len(A.shape) == 3:
@@ -460,7 +460,7 @@ def backward(ctx, grad_output):
             # compute grad_bias first before changing grad_output dtype
             grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias)
 
-        # Cast grad_output to fp16
+        # Cast grad_output
         if len(grad_output.shape) == 3:
             grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
 
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index b223f0703..c52fd0230 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -2371,7 +2371,6 @@ def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
     x += offset
     return x.to(dtype)
 
-
 def cuda_extract_outliers(A, SA, idx):
     shapeA = SA[0]
     formatA = SA[1]
@@ -2408,20 +2407,12 @@ def pipeline_test(A, batch_size):
 def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
         if A.device is "cuda":
             return cuda_double_quant(A=A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
-        elif A.device is "cpu":
-            pass
-        elif A.device is "xpu":
-            pass
         else:
             pass
 
 def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
         if A.device is "cuda":
             cuda_transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
-        elif A.device is "cpu":
-            pass
-        elif A.device is "xpu":
-            pass
         else:
             pass
 
@@ -2447,10 +2438,6 @@ def mm_dequant(
 ):
         if A.device is "cuda":
             cuda_mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
-        elif A.device is "cpu":
-            pass
-        elif A.device is "xpu":
-            pass
         else:
             pass
 
@@ -2458,29 +2445,17 @@ def mm_dequant(
 def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
         if A.device is "cuda":
             cuda_quantize_4bit(A, absmax = absmax, out = out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
-        elif A.device is "cpu":
-            pass
-        elif A.device is "xpu":
-            pass
         else:
             pass
 
 def dequantize_4bit(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
         if A.device is "cuda":
             cuda_dequantize_4bit(A, quant_state = quant_state, absmax = absmax, out = out, blocksize = blocksize, quant_type=quant_type)
-        elif A.device is "cpu":
-            pass
-        elif A.device is "xpu":
-            pass
         else:
             pass
 
 def extract_outliers(A, SA, idx):
         if A.device is "cuda":
             cuda_extract_outliers(A, SA, idx)
-        elif A.device is "cpu":
-            pass
-        elif A.device is "xpu":
-            pass
         else:
             pass
\ No newline at end of file
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index d024ddd9e..be90c829e 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -153,23 +153,17 @@ def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64,
         return self
 
     def cpu(self, device):
-        w = self.data.contiguous().half()
-        w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
-        self.data = w_4bit
-        self.quant_state = quant_state
+        warnings.warn("CPU Params4bit will be soon supported, return raw Params4bit for now")
 
         return self
 
-    def cuda(self, device):
-        w = self.data.contiguous().half().cuda(device)
-        w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
-        self.data = w_4bit
-        self.quant_state = quant_state
+    def xpu(self, device):
+        warnings.warn("XPU Params4bit will be soon supported, return raw Params4bit for now")
 
         return self
 
-    def xpu(self, device):
-        w = self.data.contiguous().half().to("xpu")
+    def cuda(self, device):
+        w = self.data.contiguous().half().cuda(device)
         w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
         self.data = w_4bit
         self.quant_state = quant_state
@@ -311,34 +305,13 @@ def __new__(
 
 
     def cpu(self, device):
-        if self.has_fp16_weights:
-            return super()
-        else:
-            # we store the 8-bit rows-major weight
-            # we convert this weight to the turning/ampere weight during the first inference pass
-            B = self.data.contiguous().half()
-            CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
-            del CBt
-            del SCBt
-            self.data = CB
-            setattr(self, "CB", CB)
-            setattr(self, "SCB", SCB)
+        warnings.warn("XPU Int8Params will be soon supported, return raw Int8Params for now")
 
         return self
 
-    def cpu(self, device):
-        if self.has_fp16_weights:
-            return super().to("xpu")
-        else:
-            # we store the 8-bit rows-major weight
-            # we convert this weight to the turning/ampere weight during the first inference pass
-            B = self.data.contiguous().half().to("xpu")
-            CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
-            del CBt
-            del SCBt
-            self.data = CB
-            setattr(self, "CB", CB)
-            setattr(self, "SCB", SCB)
+
+    def xpu(self, device):
+        warnings.warn("XPU Int8Params will be soon supported, return raw Int8Params for now")
 
         return self
 
@@ -423,7 +396,8 @@ def __init__(self, input_features, output_features, bias=True, has_fp16_weights=
         self.index = index
 
         self.state.threshold = threshold
-        self.state.has_fp16_weights = has_fp16_weights
+        # fp16 not supports on CPU yet
+        self.state.has_fp16_weights = has_fp16_weights if device is not "cpu" else False
         self.state.memory_efficient_backward = memory_efficient_backward
         if threshold > 0.0 and not has_fp16_weights:
             self.state.use_pool = True

From 68fd024206d1e1012083df131bff2fa34add665c Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Wed, 29 Nov 2023 23:53:34 -0800
Subject: [PATCH 019/234] device stepup

---
 bitsandbytes/__init__.py                      |  2 +-
 bitsandbytes/__main__.py                      |  4 +-
 bitsandbytes/autograd/_functions.py           | 13 ++--
 bitsandbytes/cextension.py                    | 22 +++---
 .../cpu}/__init__.py                          |  0
 bitsandbytes/device_setup/cpu/main.py         | 40 ++++++++++
 bitsandbytes/device_setup/cuda/__init__.py    |  0
 .../cuda}/env_vars.py                         |  0
 .../{cuda_setup => device_setup/cuda}/main.py |  2 +-
 bitsandbytes/device_setup/xpu/__init__.py     |  0
 bitsandbytes/device_setup/xpu/main.py         | 15 ++++
 bitsandbytes/functional.py                    | 73 +++++++++++--------
 bitsandbytes/nn/modules.py                    |  5 +-
 tests/test_cuda_setup_evaluator.py            |  2 +-
 14 files changed, 123 insertions(+), 55 deletions(-)
 rename bitsandbytes/{cuda_setup => device_setup/cpu}/__init__.py (100%)
 create mode 100644 bitsandbytes/device_setup/cpu/main.py
 create mode 100644 bitsandbytes/device_setup/cuda/__init__.py
 rename bitsandbytes/{cuda_setup => device_setup/cuda}/env_vars.py (100%)
 rename bitsandbytes/{cuda_setup => device_setup/cuda}/main.py (99%)
 create mode 100644 bitsandbytes/device_setup/xpu/__init__.py
 create mode 100644 bitsandbytes/device_setup/xpu/main.py

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index f35a3b582..1469e5a10 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-from . import cuda_setup, utils, research
+from . import device_setup, utils, research
 from .autograd._functions import (
     MatmulLtState,
     bmm_cublas,
diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py
index 523d02301..0626752dd 100644
--- a/bitsandbytes/__main__.py
+++ b/bitsandbytes/__main__.py
@@ -97,8 +97,8 @@ def print_debug_info() -> None:
 
 
 from . import COMPILED_WITH_CUDA, PACKAGE_GITHUB_URL
-from .cuda_setup.env_vars import to_be_ignored
-from .cuda_setup.main import get_compute_capabilities
+from .device_setup.cuda.env_vars import to_be_ignored
+from .device_setup.cuda.main import get_compute_capabilities
 
 
 print_header("OTHER")
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 21b814bf5..f99f87312 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -223,8 +223,6 @@ def backward(ctx, grad_output):
 
 
 def supports_igemmlt(device: torch.device) -> bool:
-    if device is "cpu":
-        return True
 
     """check if this device supports the optimized int8 kernel"""
     if torch.cuda.get_device_capability(device=device) < (7, 5):
@@ -233,6 +231,10 @@ def supports_igemmlt(device: torch.device) -> bool:
     nvidia16_models = ('GTX 1630', 'GTX 1650', 'GTX 1660')  # https://en.wikipedia.org/wiki/GeForce_16_series
     if any(model_name in device_name for model_name in nvidia16_models):
         return False  # these devices are technically cuda 7.5-capable, but they lack tensor cores
+
+    if device == "cpu":
+        return False
+
     return True
 
 
@@ -291,7 +293,6 @@ def tile_indices(self):
             self._tile_indices = get_tile_inds(self.formatB, self.CxB.device)
         return self._tile_indices
 
-
 class MatMul8bitLt(torch.autograd.Function):
     # forward is the same, but we added the fallback for pre-turing GPUs
     # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None")
@@ -322,8 +323,8 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
         if state.outlier_pool is None:
             state.outlier_pool = GlobalOutlierPooler.get_instance()
 
-        # Cast A to fp16 if not on CPU
-        ctx.cast_dtype = torch.bfloat16 if device is "cpu" else torch.float16
+        # Cast A to fp16
+        ctx.cast_dtype = torch.float16
         if A.dtype != ctx.cast_dtype:
             warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to {ctx.cast_dtype} during quantization")
 
@@ -571,7 +572,7 @@ def matmul(
 
 def matmul_4bit(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None):
     assert quant_state is not None
-    if A.numel() == A.shape[-1] and A.requires_grad == False and A.device is "cuda":
+    if A.numel() == A.shape[-1] and A.requires_grad == False and A.device == "cuda":
         absmax, shape, dtype, blocksize, compressed_stats, quant_type, data_type = quant_state
         if A.shape[-1] % blocksize != 0:
             warn(f'Some matrices hidden dimension is not a multiple of {blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 42fe44387..76bfa6647 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -4,16 +4,15 @@
 
 from pathlib import Path
 from warnings import warn
+from bitsandbytes.device_setup.cuda.main import CUDASetup
 
-from bitsandbytes.cuda_setup.main import CUDASetup
+setup = CUDASetup.get_instance()
+if setup.initialized != True:
+    setup.run_cuda_setup()
 
-if torch.cuda.is_available():
-    setup = CUDASetup.get_instance()
-    if setup.initialized != True:
-        setup.run_cuda_setup()
+lib = setup.lib
 
-    lib = setup.lib
- 
+try:
     if lib is None and torch.cuda.is_available():
         CUDASetup.get_instance().generate_instructions()
         CUDASetup.get_instance().print_log_stack()
@@ -30,10 +29,11 @@
     lib.get_cusparse.restype = ct.c_void_p
     lib.cget_managed_ptr.restype = ct.c_void_p
     COMPILED_WITH_CUDA = True
-
-else:
-    warn("The installed version of bitsandbytes was compiled without GPU support. Will"
-        "run with CPU support")
+except AttributeError as ex:
+    warn("The installed version of bitsandbytes was compiled without CUDA GPU support. "
+        "8-bit optimizers, 8-bit multiplication, and CUDA GPU quantization are unavailable.")
+    COMPILED_WITH_CUDA = False
+    print(str(ex))
 
 # print the setup details after checking for errors so we do not print twice
 #if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
diff --git a/bitsandbytes/cuda_setup/__init__.py b/bitsandbytes/device_setup/cpu/__init__.py
similarity index 100%
rename from bitsandbytes/cuda_setup/__init__.py
rename to bitsandbytes/device_setup/cpu/__init__.py
diff --git a/bitsandbytes/device_setup/cpu/main.py b/bitsandbytes/device_setup/cpu/main.py
new file mode 100644
index 000000000..8e44a6ae9
--- /dev/null
+++ b/bitsandbytes/device_setup/cpu/main.py
@@ -0,0 +1,40 @@
+from packaging import version
+import importlib.metadata
+from warnings import warn
+def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]:
+    # Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version
+    package_exists = importlib.util.find_spec(pkg_name) is not None
+    package_version = "N/A"
+    if package_exists:
+        try:
+            package_version = importlib.metadata.version(pkg_name)
+            package_exists = True
+        except importlib.metadata.PackageNotFoundError:
+            package_exists = False
+    if return_version:
+        return package_exists, package_version
+    else:
+        return package_exists
+
+_torch_version = "N/A"
+_torch_available = False
+_torch_available, _torch_version = _is_package_available("torch", return_version=True)
+_ipex_available, _ipex_version = _is_package_available("intel_extension_for_pytorch", return_version=True)
+
+def is_ipex_cpu_available():
+    def get_major_and_minor_from_version(full_version):
+        return str(version.parse(full_version).major) + "." + str(version.parse(full_version).minor)
+
+    if not _torch_available or not _ipex_available:
+        return False
+
+    torch_major_and_minor = get_major_and_minor_from_version(_torch_version)
+    ipex_major_and_minor = get_major_and_minor_from_version(_ipex_version)
+    if torch_major_and_minor != ipex_major_and_minor:
+        warn(
+            f"Intel Extension for PyTorch {ipex_major_and_minor} needs to work with PyTorch {ipex_major_and_minor}.*,"
+            f" but PyTorch {_torch_version} is found. Please switch to the matching version and run again."
+            "Refer to https://intel.github.io/intel-extension-for-pytorch/ for more details."
+        )
+        return False
+    return True
\ No newline at end of file
diff --git a/bitsandbytes/device_setup/cuda/__init__.py b/bitsandbytes/device_setup/cuda/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/bitsandbytes/cuda_setup/env_vars.py b/bitsandbytes/device_setup/cuda/env_vars.py
similarity index 100%
rename from bitsandbytes/cuda_setup/env_vars.py
rename to bitsandbytes/device_setup/cuda/env_vars.py
diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/device_setup/cuda/main.py
similarity index 99%
rename from bitsandbytes/cuda_setup/main.py
rename to bitsandbytes/device_setup/cuda/main.py
index f3edf4c73..fd639beb7 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/device_setup/cuda/main.py
@@ -254,7 +254,7 @@ def determine_cuda_runtime_lib_path() -> Union[Path, None]:
             1. active conda env
             2. LD_LIBRARY_PATH
             3. any other env vars, while ignoring those that
-                - are known to be unrelated (see `bnb.cuda_setup.env_vars.to_be_ignored`)
+                - are known to be unrelated (see `bnb.device_setup.cuda.env_vars.to_be_ignored`)
                 - don't contain the path separator `/`
 
         If multiple libraries are found in part 3, we optimistically try one,
diff --git a/bitsandbytes/device_setup/xpu/__init__.py b/bitsandbytes/device_setup/xpu/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/bitsandbytes/device_setup/xpu/main.py b/bitsandbytes/device_setup/xpu/main.py
new file mode 100644
index 000000000..f13e6cb2d
--- /dev/null
+++ b/bitsandbytes/device_setup/xpu/main.py
@@ -0,0 +1,15 @@
+from .cpu.main import is_ipex_cpu_available
+from warnings import warn
+
+def is_ipex_xpu_available():
+    if is_ipex_cpu_available():
+        import intel_extension_for_pytorch
+    else:
+        return False
+
+    if torch.xpu.is_available():
+        return True
+    else:
+        warn("The installed version of intel_extension_for_pytorch is not supporting XPU device, "
+            " or the XPU device is unavailable.")
+        return False
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index c52fd0230..cfa8b91b0 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -15,9 +15,26 @@
 from functools import reduce  # Required in Python 3
 from typing import Tuple
 from torch import Tensor
+from warnings import warn
+from .cextension import COMPILED_WITH_CUDA
 
-from .cextension import COMPILED_WITH_CUDA, lib
-
+# CUDA specific lib
+if COMPILED_WITH_CUDA:
+    from .cextension import lib
+
+from bitsandbytes.device_setup.cpu.main import is_ipex_cpu_available
+from bitsandbytes.device_setup.xpu.main import is_ipex_xpu_available
+if not is_ipex_cpu_available():
+    warn(
+        "Intel Extension for PyTorch CPU/XPU supports are not available."
+        "Please refer to https://intel.github.io/intel-extension-for-pytorch/ for installation."
+        )
+else:
+    if not is_ipex_xpu_available():
+        warn(
+            "Intel Extension for PyTorch CPU support is available, while XPU is not."
+            )
+    import intel_extension_for_pytorch as ipex
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):
@@ -2403,28 +2420,24 @@ def pipeline_test(A, batch_size):
     return out
 
 
-# 8 bits
+# 8 bits functions
 def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
-        if A.device is "cuda":
+        if A.device == "cuda":
             return cuda_double_quant(A=A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
         else:
-            pass
+            raise RuntimeError("double_quant on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
 
 def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
-        if A.device is "cuda":
-            cuda_transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
+        if A.device == "cuda":
+            return cuda_transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
         else:
-            pass
+            raise RuntimeError("transform on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
 
 def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
-        if A.device is "cuda":
-            cuda_igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
-        elif A.device is "cpu":
-            pass
-        elif A.device is "xpu":
-            pass
+        if A.device == "cuda":
+            return cuda_igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
         else:
-            pass
+            raise RuntimeError("igemmlt on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
 
 def mm_dequant(
     A,
@@ -2436,26 +2449,26 @@ def mm_dequant(
     new_col_stats=None,
     bias=None
 ):
-        if A.device is "cuda":
+        if A.device == "cuda":
             cuda_mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
         else:
-            pass
+            raise RuntimeError("mm_dequant on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
 
-# 4 bits
-def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
-        if A.device is "cuda":
-            cuda_quantize_4bit(A, absmax = absmax, out = out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
+def extract_outliers(A, SA, idx):
+        if A.device == "cuda":
+            return cuda_extract_outliers(A, SA, idx)
         else:
-            pass
+            raise RuntimeError("extract_outliers on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
 
-def dequantize_4bit(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
-        if A.device is "cuda":
-            cuda_dequantize_4bit(A, quant_state = quant_state, absmax = absmax, out = out, blocksize = blocksize, quant_type=quant_type)
+# 4 bits functions
+def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
+        if A.device == "cuda":
+            return cuda_quantize_4bit(A, absmax = absmax, out = out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
         else:
-            pass
+            raise RuntimeError("quantize_4bit on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
 
-def extract_outliers(A, SA, idx):
-        if A.device is "cuda":
-            cuda_extract_outliers(A, SA, idx)
+def dequantize_4bit(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
+        if A.device == "cuda":
+            return cuda_dequantize_4bit(A, quant_state = quant_state, absmax = absmax, out = out, blocksize = blocksize, quant_type=quant_type)
         else:
-            pass
\ No newline at end of file
+            raise RuntimeError("dequantize_4bit on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
\ No newline at end of file
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index be90c829e..96c3fd326 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -305,7 +305,7 @@ def __new__(
 
 
     def cpu(self, device):
-        warnings.warn("XPU Int8Params will be soon supported, return raw Int8Params for now")
+        warnings.warn("CPU Int8Params will be soon supported, return raw Int8Params for now")
 
         return self
 
@@ -396,8 +396,7 @@ def __init__(self, input_features, output_features, bias=True, has_fp16_weights=
         self.index = index
 
         self.state.threshold = threshold
-        # fp16 not supports on CPU yet
-        self.state.has_fp16_weights = has_fp16_weights if device is not "cpu" else False
+        self.state.has_fp16_weights = has_fp16_weights
         self.state.memory_efficient_backward = memory_efficient_backward
         if threshold > 0.0 and not has_fp16_weights:
             self.state.use_pool = True
diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py
index e875bcd2b..166fb9890 100644
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -15,7 +15,7 @@ def test_manual_override():
     os.environ['CUDA_VERSION']='122'
     assert str(manual_cuda_path) in os.environ['LD_LIBRARY_PATH']
     import bitsandbytes as bnb
-    loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name
+    loaded_lib = bnb.device_setup.cuda.main.CUDASetup.get_instance().binary_name
     assert loaded_lib == 'libbitsandbytes_cuda122.so'
 
 

From 65b17a266d6175951e09978b760752c8fbc7cb05 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Thu, 30 Nov 2023 16:05:32 +0800
Subject: [PATCH 020/234] Update modules.py

---
 bitsandbytes/nn/modules.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 7c7c98375..d9a0e7434 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -161,8 +161,7 @@ def from_prequantized(cls, data: torch.Tensor, quantized_stats: Dict[str, Any],
         self.quant_state = QuantState.from_dict(qs_dict=quantized_stats, device=device)
         self.blocksize = self.quant_state.blocksize
         self.compress_statistics = self.quant_state.nested
-
-        return self
+        self.quant_type = self.quant_state.quant_type
 
     def cpu(self, device):
         warnings.warn("CPU Params4bit will be soon supported, return raw Params4bit for now")

From c5044e01aade4d39192efe6815cfdee446e83bfb Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Fri, 1 Dec 2023 17:11:36 +0800
Subject: [PATCH 021/234] Update bitsandbytes/functional.py

Co-authored-by: Jiong Gong <jiong.gong@intel.com>
---
 bitsandbytes/functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index eab555a0b..69035f033 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -2542,7 +2542,7 @@ def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None,
         if A.device == "cuda":
             return cuda_double_quant(A=A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
         else:
-            raise RuntimeError("double_quant on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
+            raise RuntimeError("double_quant is not supported on non-CUDA devices")
 
 def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
         if A.device == "cuda":

From b23789ab77d6c3703360be90aadf361d4c7a8239 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Sun, 3 Dec 2023 09:21:47 -0800
Subject: [PATCH 022/234] add backends

---
 bitsandbytes/autograd/_functions.py           |   17 +-
 bitsandbytes/backends.py                      |   38 +
 .../device_setup/{cpu => }/__init__.py        |    0
 bitsandbytes/device_setup/cpu/main.py         |   40 -
 bitsandbytes/device_setup/xpu/__init__.py     |    0
 bitsandbytes/device_setup/xpu/main.py         |   15 -
 bitsandbytes/functional.py                    | 1050 ++++++++---------
 7 files changed, 533 insertions(+), 627 deletions(-)
 create mode 100644 bitsandbytes/backends.py
 rename bitsandbytes/device_setup/{cpu => }/__init__.py (100%)
 delete mode 100644 bitsandbytes/device_setup/cpu/main.py
 delete mode 100644 bitsandbytes/device_setup/xpu/__init__.py
 delete mode 100644 bitsandbytes/device_setup/xpu/main.py

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 548e6577c..757aafb4f 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -223,7 +223,6 @@ def backward(ctx, grad_output):
 
 
 def supports_igemmlt(device: torch.device) -> bool:
-
     """check if this device supports the optimized int8 kernel"""
     if torch.cuda.get_device_capability(device=device) < (7, 5):
         return False
@@ -231,8 +230,8 @@ def supports_igemmlt(device: torch.device) -> bool:
     nvidia16_models = ('GTX 1630', 'GTX 1650', 'GTX 1660')  # https://en.wikipedia.org/wiki/GeForce_16_series
     if any(model_name in device_name for model_name in nvidia16_models):
         return False  # these devices are technically cuda 7.5-capable, but they lack tensor cores
-
     if device == "cpu":
+        #TODO: will return True once CPU backend upstream the supports
         return False
 
     return True
@@ -272,7 +271,6 @@ class MatmulLtState:
     idx = None
     is_training = True
     has_fp16_weights = True
-
     memory_efficient_backward = False
     use_pool = False
     formatB = F.get_special_format_str()
@@ -324,14 +322,13 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
             state.outlier_pool = GlobalOutlierPooler.get_instance()
 
         # Cast A to fp16
-        ctx.cast_dtype = torch.float16
-        if A.dtype != ctx.cast_dtype:
-            warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to {ctx.cast_dtype} during quantization")
+        if A.dtype != torch.float16:
+            warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
 
         # 1. Quantize A
         if len(A.shape) == 3:
             A = A.reshape(-1, A.shape[-1])
-        CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(ctx.cast_dtype), threshold=state.threshold)
+        CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(torch.float16), threshold=state.threshold)
 
         if state.threshold > 0.0 and coo_tensorA is not None:
             if state.has_fp16_weights:
@@ -366,7 +363,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
                     state.SCB,
                     state.SCBt,
                     coo_tensorB,
-                ) = F.double_quant(B.to(ctx.cast_dtype))
+                ) = F.double_quant(B.to(torch.float16))
                 if using_igemmlt:
                     state.CxB, state.SB = F.transform(CB, to_order=formatB)
                 else:
@@ -461,11 +458,11 @@ def backward(ctx, grad_output):
             # compute grad_bias first before changing grad_output dtype
             grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias)
 
-        # Cast grad_output
+        # Cast grad_output to fp16
         if len(grad_output.shape) == 3:
             grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
 
-        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(ctx.cast_dtype))
+        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
         if req_gradB:
             CxAt, SAt = F.transform(CAt, formatB, transpose=True)
             C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)
diff --git a/bitsandbytes/backends.py b/bitsandbytes/backends.py
new file mode 100644
index 000000000..eb7bda484
--- /dev/null
+++ b/bitsandbytes/backends.py
@@ -0,0 +1,38 @@
+class Backends:
+    """
+    An dict class for device backends that registered with 8bits and 4bits functions.
+
+    The values of this device backends are lowercase strings, e.g., ``"cuda"``. They can
+    be accessed as attributes with key-value, e.g., ``Backends.device["cuda"]``.
+
+    """
+
+    def __init__(self):
+        self.devices = {}
+
+    @classmethod
+    def register_backend(backend_name: str, backend_class):
+        assert backend_name.lower() in {
+            "cpu",
+            "cuda",
+            "xpu",
+        }, "register device backend choices in [cpu, cuda, xpu]"
+
+        # check 8bits or 4bits functionality, at least one is compelete
+        if (
+            hasattr(backend_class, "double_quant")
+            and hasattr(backend_class, "transform")
+            and hasattr(backend_class, "igemmlt")
+            and hasattr(backend_class, "mm_dequant")
+            and hasattr(backend_class, "extract_outliers")
+        ):
+            self.devices[backend_name.lower()] = backend_class
+
+        elif hasattr(backend_class, "quantize_4bit") and hasattr(
+            backend_class, "dequantize_4bit"
+        ):
+            self.devices[backend_name.lower()] = backend_classq
+        else:
+            assert (
+                False
+            ), f"register device backend {backend_name.lower()} but its functionality is not compelete"
diff --git a/bitsandbytes/device_setup/cpu/__init__.py b/bitsandbytes/device_setup/__init__.py
similarity index 100%
rename from bitsandbytes/device_setup/cpu/__init__.py
rename to bitsandbytes/device_setup/__init__.py
diff --git a/bitsandbytes/device_setup/cpu/main.py b/bitsandbytes/device_setup/cpu/main.py
deleted file mode 100644
index 8e44a6ae9..000000000
--- a/bitsandbytes/device_setup/cpu/main.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from packaging import version
-import importlib.metadata
-from warnings import warn
-def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]:
-    # Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version
-    package_exists = importlib.util.find_spec(pkg_name) is not None
-    package_version = "N/A"
-    if package_exists:
-        try:
-            package_version = importlib.metadata.version(pkg_name)
-            package_exists = True
-        except importlib.metadata.PackageNotFoundError:
-            package_exists = False
-    if return_version:
-        return package_exists, package_version
-    else:
-        return package_exists
-
-_torch_version = "N/A"
-_torch_available = False
-_torch_available, _torch_version = _is_package_available("torch", return_version=True)
-_ipex_available, _ipex_version = _is_package_available("intel_extension_for_pytorch", return_version=True)
-
-def is_ipex_cpu_available():
-    def get_major_and_minor_from_version(full_version):
-        return str(version.parse(full_version).major) + "." + str(version.parse(full_version).minor)
-
-    if not _torch_available or not _ipex_available:
-        return False
-
-    torch_major_and_minor = get_major_and_minor_from_version(_torch_version)
-    ipex_major_and_minor = get_major_and_minor_from_version(_ipex_version)
-    if torch_major_and_minor != ipex_major_and_minor:
-        warn(
-            f"Intel Extension for PyTorch {ipex_major_and_minor} needs to work with PyTorch {ipex_major_and_minor}.*,"
-            f" but PyTorch {_torch_version} is found. Please switch to the matching version and run again."
-            "Refer to https://intel.github.io/intel-extension-for-pytorch/ for more details."
-        )
-        return False
-    return True
\ No newline at end of file
diff --git a/bitsandbytes/device_setup/xpu/__init__.py b/bitsandbytes/device_setup/xpu/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/bitsandbytes/device_setup/xpu/main.py b/bitsandbytes/device_setup/xpu/main.py
deleted file mode 100644
index f13e6cb2d..000000000
--- a/bitsandbytes/device_setup/xpu/main.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from .cpu.main import is_ipex_cpu_available
-from warnings import warn
-
-def is_ipex_xpu_available():
-    if is_ipex_cpu_available():
-        import intel_extension_for_pytorch
-    else:
-        return False
-
-    if torch.xpu.is_available():
-        return True
-    else:
-        warn("The installed version of intel_extension_for_pytorch is not supporting XPU device, "
-            " or the XPU device is unavailable.")
-        return False
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 69035f033..c30e1b651 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -19,26 +19,12 @@
 from warnings import warn
 from .cextension import COMPILED_WITH_CUDA
 from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
-
+from backends import Backends
 
 # CUDA specific lib
 if COMPILED_WITH_CUDA:
     from .cextension import lib
 
-from bitsandbytes.device_setup.cpu.main import is_ipex_cpu_available
-from bitsandbytes.device_setup.xpu.main import is_ipex_xpu_available
-if not is_ipex_cpu_available():
-    warn(
-        "Intel Extension for PyTorch CPU/XPU supports are not available."
-        "Please refer to https://intel.github.io/intel-extension-for-pytorch/ for installation."
-        )
-else:
-    if not is_ipex_xpu_available():
-        warn(
-            "Intel Extension for PyTorch CPU support is available, while XPU is not."
-            )
-    import intel_extension_for_pytorch as ipex
-
 # math.prod not compatible with python < 3.8
 def prod(iterable):
     return reduce(operator.mul, iterable, 1)
@@ -908,169 +894,12 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize
 def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4')
 
-def cuda_quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
-    """
-    Quantize tensor A in blocks of 4-bit values.
-
-    Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
-
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input tensor.
-    absmax : torch.Tensor
-        The absmax values.
-    out : torch.Tensor
-        The output tensor (8-bit).
-    blocksize : int
-        The blocksize used in quantization.
-    quant_type : str
-        The 4-bit quantization data type {fp4, nf4}
-
-    Returns
-    -------
-    torch.Tensor:
-        The 8-bit tensor with packed 4-bit values.
-    tuple(torch.Tensor, torch.Size, torch.dtype, int):
-        The quantization state to undo the quantization.
-    """
-    if A.device.type != 'cuda':
-        raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
-    if quant_type not in ['fp4', 'nf4']:
-        raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
-
-    n = A.numel()
-    input_shape = A.shape
-
-    if absmax is None:
-        blocks = n // blocksize
-        blocks += 1 if n % blocksize > 0 else 0
-        absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
-
-
-    if out is None:
-        out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device)
-
-    assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
-
-    prev_device = pre_call(A.device)
-    is_on_gpu([A, out, absmax])
-
-    if A.dtype == torch.float32:
-        if quant_type == 'fp4':
-            lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-        else:
-            lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-    elif A.dtype == torch.float16:
-        if quant_type == 'fp4':
-            lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-        else:
-            lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-    elif A.dtype == torch.bfloat16:
-        if quant_type == 'fp4':
-            lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-        else:
-            lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-    else:
-        raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-    post_call(A.device)
-
-    code = get_4bit_type(quant_type, device=A.device)
-
-    if compress_statistics:
-        offset = absmax.mean()
-        absmax -= offset
-        qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
-        del absmax
-        state = QuantState(absmax=qabsmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, offset=offset, state2=state2)
-    else:
-        state = QuantState(absmax=absmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, )
-
-    return out, state
-
 def dequantize_fp4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
 
 def dequantize_nf4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')
 
-def dequantize_4bit(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
-    """
-    Dequantizes FP4 blockwise quantized values.
-
-    Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
-
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input 8-bit tensor (packed 4-bit values).
-    quant_state : QuantState
-        object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
-    absmax : torch.Tensor
-        The absmax values.
-    out : torch.Tensor
-        Dequantized output tensor.
-    blocksize : int
-        The blocksize used in quantization.
-    quant_type : str
-        The 4-bit quantization data type {fp4, nf4}
-
-
-    Returns
-    -------
-    torch.Tensor:
-        Dequantized tensor.
-    """
-    if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
-        raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
-    if quant_type not in ['fp4', 'nf4']:
-        raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
-
-    if quant_state is None:
-        assert absmax is not None and out is not None
-
-        quant_state = QuantState(absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type)
-
-    else:
-        absmax = quant_state.absmax
-
-
-    if quant_state.nested:
-        absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
-        absmax += quant_state.offset
-        if absmax.dtype != torch.float32: absmax = absmax.float()
-
-    if out is None:
-        out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
-
-    n = out.numel()
-
-    device = pre_call(A.device)
-    is_on_gpu([A, absmax, out])
-    if out.dtype == torch.float32:
-        if quant_state.quant_type == 'fp4':
-            lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-        else:
-            lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-    elif out.dtype == torch.float16:
-        if quant_state.quant_type == 'fp4':
-            lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-        else:
-            lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-    elif out.dtype == torch.bfloat16:
-        if quant_state.quant_type == 'fp4':
-            lib.cdequantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-        else:
-            lib.cdequantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-    else:
-        raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-    post_call(A.device)
-
-    is_transposed = (True if A.shape[0] == 1 else False)
-    if is_transposed: return out.t()
-    else: return out
-
-
 def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
     if code is None:
         if "dynamic" not in name2qmap:
@@ -1833,198 +1662,6 @@ def batched_igemm(
     return out
 
 
-def cuda_igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
-    shapeA = SA[0]
-    shapeB = SB[0]
-    dimsA = len(shapeA)
-    dimsB = len(shapeB)
-    assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
-    if dimsA == 2:
-        m = shapeA[0]
-    elif dimsA == 3:
-        m = shapeA[0] * shapeA[1]
-
-    rows = n = shapeB[0]
-    assert prod(list(shapeA)) > 0, f'Input tensor dimensions need to be > 0: {shapeA}'
-
-    # if the tensor is empty, return a transformed empty tensor with the right dimensions
-    if shapeA[0] == 0 and dimsA == 2:
-        return torch.empty((0, shapeB[0]), device=A.device, dtype=torch.float16)
-    elif shapeA[1] == 0 and dimsA == 3:
-        return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
-
-    if dimsA == 2 and out is None:
-        out, Sout = get_transform_buffer(
-            (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
-        )
-    elif dimsA == 3 and out is None:
-        out, Sout = get_transform_buffer(
-            (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
-        )
-
-    assert dimsB != 3, "len(B.shape)==3 not supported"
-    assert A.device.type == "cuda"
-    assert B.device.type == "cuda"
-    assert A.dtype == torch.int8
-    assert B.dtype == torch.int8
-    assert out.dtype == dtype
-    assert SA[1] == "col32"
-    assert SB[1] in ["col_turing", "col_ampere"]
-    assert Sout[1] == "col32"
-    assert (
-        shapeA[-1] == shapeB[-1]
-    ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
-    formatB = SB[1]
-    prev_device = A.device
-    torch.cuda.set_device(A.device)
-
-    ptr = CUBLAS_Context.get_instance().get_context(A.device)
-    ptrA = get_ptr(A)
-    ptrB = get_ptr(B)
-    ptrC = get_ptr(out)
-
-    k = shapeA[-1]
-    lda = ct.c_int32(m * 32)
-    if formatB == "col_turing":
-        # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
-        # n = rows
-        ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
-    else:
-        # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
-        # n = rows
-        ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
-
-    ldc = ct.c_int32(m * 32)
-    m = ct.c_int32(m)
-    n = ct.c_int32(n)
-    k = ct.c_int32(k)
-
-    has_error = 0
-    ptrRowScale = get_ptr(None)
-    is_on_gpu([A, B, out])
-    if formatB == 'col_turing':
-        if dtype == torch.int32:
-            has_error = lib.cigemmlt_turing_32(
-                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-            )
-        else:
-            has_error = lib.cigemmlt_turing_8(
-                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-            )
-    elif formatB == "col_ampere":
-        if dtype == torch.int32:
-            has_error = lib.cigemmlt_ampere_32(
-                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-            )
-        else:
-            has_error = lib.cigemmlt_ampere_8(
-                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-            )
-
-    if has_error == 1:
-        print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}')
-        raise Exception('cublasLt ran into an error!')
-
-    torch.cuda.set_device(prev_device)
-
-    return out, Sout
-
-
-def cuda_mm_dequant(
-    A,
-    quant_state,
-    row_stats,
-    col_stats,
-    out=None,
-    new_row_stats=None,
-    new_col_stats=None,
-    bias=None
-):
-    assert A.dtype == torch.int32
-    if bias is not None: assert bias.dtype == torch.float16
-    out_shape = quant_state[0]
-    if len(out_shape) == 3:
-        out_shape = (out_shape[0] * out_shape[1], out_shape[2])
-
-    if out is None:
-        out = torch.empty(out_shape, dtype=torch.float16, device=A.device)
-    if new_row_stats is None:
-        new_row_stats = torch.empty(
-            out_shape[0], dtype=torch.float32, device=A.device
-        )
-    if new_col_stats is None:
-        new_col_stats = torch.empty(
-            out_shape[1], dtype=torch.float32, device=A.device
-        )
-    assert (
-        new_row_stats.shape[0] == row_stats.shape[0]
-    ), f"{new_row_stats.shape} vs {row_stats.shape}"
-    assert (
-        new_col_stats.shape[0] == col_stats.shape[0]
-    ), f"{new_col_stats.shape} vs {col_stats.shape}"
-
-    prev_device = pre_call(A.device)
-    ptrA = get_ptr(A)
-    ptrOut = get_ptr(out)
-    ptrRowStats = get_ptr(row_stats)
-    ptrColStats = get_ptr(col_stats)
-    ptrNewRowStats = get_ptr(new_row_stats)
-    ptrNewColStats = get_ptr(new_col_stats)
-    ptrBias = get_ptr(bias)
-    numRows = ct.c_int32(out_shape[0])
-    numCols = ct.c_int32(out_shape[1])
-
-    is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias])
-    lib.cdequant_mm_int32_fp16(ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols)
-    post_call(prev_device)
-
-    return out
-
-
-def get_colrow_absmax(
-    A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0
-):
-    assert A.dtype == torch.float16
-    device = A.device
-
-    cols = A.shape[-1]
-    if len(A.shape) == 3:
-        rows = A.shape[0] * A.shape[1]
-    else:
-        rows = A.shape[0]
-
-    col_tiles = (cols + 255) // 256
-    tiled_rows = ((rows + 15) // 16) * 16
-    if row_stats is None:
-        row_stats = torch.empty(
-            (rows,), dtype=torch.float32, device=device
-        ).fill_(-50000.0)
-    if col_stats is None:
-        col_stats = torch.empty(
-            (cols,), dtype=torch.float32, device=device
-        ).fill_(-50000.0)
-
-    if nnz_block_ptr is None and threshold > 0.0:
-        nnz_block_ptr = torch.zeros(
-            ((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device
-        )
-
-    ptrA = get_ptr(A)
-    ptrRowStats = get_ptr(row_stats)
-    ptrColStats = get_ptr(col_stats)
-    ptrNnzrows = get_ptr(nnz_block_ptr)
-    rows = ct.c_int32(rows)
-    cols = ct.c_int32(cols)
-
-    prev_device = pre_call(A.device)
-    is_on_gpu([A, row_stats, col_stats, nnz_block_ptr])
-    lib.cget_col_row_stats(ptrA, ptrRowStats, ptrColStats, ptrNnzrows, ct.c_float(threshold), rows, cols)
-    post_call(prev_device)
-
-    if threshold > 0.0:
-        nnz_block_ptr.cumsum_(0)
-
-    return row_stats, col_stats, nnz_block_ptr
 
 
 class COOSparseTensor:
@@ -2113,147 +1750,6 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
     values = torch.zeros((nnz,), dtype=dtype, device=device)
     return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values)
 
-
-def cuda_double_quant(
-    A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
-):
-    device = A.device
-    assert A.dtype == torch.half
-    assert device.type == "cuda"
-    prev_device = pre_call(A.device)
-
-    cols = A.shape[-1]
-    if len(A.shape) == 3:
-        rows = A.shape[0] * A.shape[1]
-    else:
-        rows = A.shape[0]
-
-    if row_stats is None or col_stats is None:
-        row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(
-            A, threshold=threshold
-        )
-
-    if out_col is None:
-        out_col = torch.zeros(A.shape, device=device, dtype=torch.int8)
-    if out_row is None:
-        out_row = torch.zeros(A.shape, device=device, dtype=torch.int8)
-
-    coo_tensor = None
-    ptrA = get_ptr(A)
-    ptrColStats = get_ptr(col_stats)
-    ptrRowStats = get_ptr(row_stats)
-    ptrOutCol = get_ptr(out_col)
-    ptrOutRow = get_ptr(out_row)
-
-    is_on_gpu([A, col_stats, row_stats, out_col, out_row])
-    if threshold > 0.0:
-        nnz = nnz_row_ptr[-1].item()
-        if nnz > 0:
-            coo_tensor = coo_zeros(
-                A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device
-            )
-            ptrRowIdx = get_ptr(coo_tensor.rowidx)
-            ptrColIdx = get_ptr(coo_tensor.colidx)
-            ptrVal = get_ptr(coo_tensor.values)
-            ptrRowPtr = get_ptr(nnz_row_ptr)
-
-            lib.cdouble_rowcol_quant(
-                ptrA,
-                ptrRowStats,
-                ptrColStats,
-                ptrOutCol,
-                ptrOutRow,
-                ptrRowIdx,
-                ptrColIdx,
-                ptrVal,
-                ptrRowPtr,
-                ct.c_float(threshold),
-                ct.c_int32(rows),
-                ct.c_int32(cols),
-            )
-            val, idx = torch.sort(coo_tensor.rowidx)
-            coo_tensor.rowidx = val
-            coo_tensor.colidx = coo_tensor.colidx[idx]
-            coo_tensor.values = coo_tensor.values[idx]
-        else:
-            lib.cdouble_rowcol_quant(
-                ptrA,
-                ptrRowStats,
-                ptrColStats,
-                ptrOutCol,
-                ptrOutRow,
-                None,
-                None,
-                None,
-                None,
-                ct.c_float(0.0),
-                ct.c_int32(rows),
-                ct.c_int32(cols),
-            )
-    else:
-        lib.cdouble_rowcol_quant(
-            ptrA,
-            ptrRowStats,
-            ptrColStats,
-            ptrOutCol,
-            ptrOutRow,
-            None,
-            None,
-            None,
-            None,
-            ct.c_float(threshold),
-            ct.c_int32(rows),
-            ct.c_int32(cols),
-        )
-    post_call(prev_device)
-
-    return out_row, out_col, row_stats, col_stats, coo_tensor
-
-
-def cuda_transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
-    prev_device = pre_call(A.device)
-    if state is None: state = (A.shape, from_order)
-    else: from_order = state[1]
-    if out is None: out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
-    else: new_state = (state[0], to_order) # (shape, order)
-
-    shape = state[0]
-    if len(shape) == 2:
-        dim1 = ct.c_int32(shape[0])
-        dim2 = ct.c_int32(shape[1])
-    else:
-        dim1 = ct.c_int32(shape[0] * shape[1])
-        dim2 = ct.c_int32(shape[2])
-
-    is_on_gpu([A, out])
-    if to_order == 'col32':
-        if transpose:
-            lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
-        else:
-            lib.ctransform_row2col32(get_ptr(A), get_ptr(out), dim1, dim2)
-    elif to_order == "col_turing":
-        if transpose:
-            lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2)
-        else:
-            lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2)
-    elif to_order == "col_ampere":
-        if transpose:
-            lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2)
-        else:
-            lib.ctransform_row2ampere(get_ptr(A), get_ptr(out), dim1, dim2)
-    elif to_order == "row":
-        if from_order == "col_turing":
-            lib.ctransform_turing2row(get_ptr(A), get_ptr(out), dim1, dim2)
-        elif from_order == "col_ampere":
-            lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
-    else:
-        raise NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}')
-
-    post_call(prev_device)
-
-    return out, new_state
-
-
 def spmm_coo(cooA, B, out=None):
     if out is None:
         out = torch.empty(
@@ -2505,56 +2001,494 @@ def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
     x += offset
     return x.to(dtype)
 
-def cuda_extract_outliers(A, SA, idx):
-    shapeA = SA[0]
-    formatA = SA[1]
-    assert formatA in ["col_turing", "col_ampere"]
-    assert A.device.type == "cuda"
+def pipeline_test(A, batch_size):
+    out = torch.zeros_like(A)
+    lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
+    return out
 
-    out = torch.zeros(
-        (shapeA[0], idx.numel()), dtype=torch.int8, device=A.device
-    )
+class CUDABackend:
+    @classmethod
+    def double_quant(
+        A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
+    ):
+        device = A.device
+        assert A.dtype == torch.half
+        assert device.type == "cuda"
+        prev_device = pre_call(A.device)
 
-    idx_size = ct.c_int32(idx.numel())
-    rows = ct.c_int32(shapeA[0])
-    cols = ct.c_int32(shapeA[1])
-    ptrA = get_ptr(A)
-    ptrIdx = get_ptr(idx)
-    ptrOut = get_ptr(out)
+        cols = A.shape[-1]
+        if len(A.shape) == 3:
+            rows = A.shape[0] * A.shape[1]
+        else:
+            rows = A.shape[0]
 
-    prev_device = pre_call(A.device)
-    if formatA == 'col_turing':
-        lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
-    elif formatA == "col_ampere":
-        lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
-    post_call(prev_device)
+        if row_stats is None or col_stats is None:
+            row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(
+                A, threshold=threshold
+            )
 
-    return out
+        if out_col is None:
+            out_col = torch.zeros(A.shape, device=device, dtype=torch.int8)
+        if out_row is None:
+            out_row = torch.zeros(A.shape, device=device, dtype=torch.int8)
+
+        coo_tensor = None
+        ptrA = get_ptr(A)
+        ptrColStats = get_ptr(col_stats)
+        ptrRowStats = get_ptr(row_stats)
+        ptrOutCol = get_ptr(out_col)
+        ptrOutRow = get_ptr(out_row)
+
+        is_on_gpu([A, col_stats, row_stats, out_col, out_row])
+        if threshold > 0.0:
+            nnz = nnz_row_ptr[-1].item()
+            if nnz > 0:
+                coo_tensor = coo_zeros(
+                    A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device
+                )
+                ptrRowIdx = get_ptr(coo_tensor.rowidx)
+                ptrColIdx = get_ptr(coo_tensor.colidx)
+                ptrVal = get_ptr(coo_tensor.values)
+                ptrRowPtr = get_ptr(nnz_row_ptr)
+
+                lib.cdouble_rowcol_quant(
+                    ptrA,
+                    ptrRowStats,
+                    ptrColStats,
+                    ptrOutCol,
+                    ptrOutRow,
+                    ptrRowIdx,
+                    ptrColIdx,
+                    ptrVal,
+                    ptrRowPtr,
+                    ct.c_float(threshold),
+                    ct.c_int32(rows),
+                    ct.c_int32(cols),
+                )
+                val, idx = torch.sort(coo_tensor.rowidx)
+                coo_tensor.rowidx = val
+                coo_tensor.colidx = coo_tensor.colidx[idx]
+                coo_tensor.values = coo_tensor.values[idx]
+            else:
+                lib.cdouble_rowcol_quant(
+                    ptrA,
+                    ptrRowStats,
+                    ptrColStats,
+                    ptrOutCol,
+                    ptrOutRow,
+                    None,
+                    None,
+                    None,
+                    None,
+                    ct.c_float(0.0),
+                    ct.c_int32(rows),
+                    ct.c_int32(cols),
+                )
+        else:
+            lib.cdouble_rowcol_quant(
+                ptrA,
+                ptrRowStats,
+                ptrColStats,
+                ptrOutCol,
+                ptrOutRow,
+                None,
+                None,
+                None,
+                None,
+                ct.c_float(threshold),
+                ct.c_int32(rows),
+                ct.c_int32(cols),
+            )
+        post_call(prev_device)
 
-def pipeline_test(A, batch_size):
-    out = torch.zeros_like(A)
-    lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
-    return out
+        return out_row, out_col, row_stats, col_stats, coo_tensor
+    @classmethod
+    def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+        prev_device = pre_call(A.device)
+        if state is None: state = (A.shape, from_order)
+        else: from_order = state[1]
+        if out is None: out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
+        else: new_state = (state[0], to_order) # (shape, order)
+
+        shape = state[0]
+        if len(shape) == 2:
+            dim1 = ct.c_int32(shape[0])
+            dim2 = ct.c_int32(shape[1])
+        else:
+            dim1 = ct.c_int32(shape[0] * shape[1])
+            dim2 = ct.c_int32(shape[2])
 
+        is_on_gpu([A, out])
+        if to_order == 'col32':
+            if transpose:
+                lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
+            else:
+                lib.ctransform_row2col32(get_ptr(A), get_ptr(out), dim1, dim2)
+        elif to_order == "col_turing":
+            if transpose:
+                lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2)
+            else:
+                lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2)
+        elif to_order == "col_ampere":
+            if transpose:
+                lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2)
+            else:
+                lib.ctransform_row2ampere(get_ptr(A), get_ptr(out), dim1, dim2)
+        elif to_order == "row":
+            if from_order == "col_turing":
+                lib.ctransform_turing2row(get_ptr(A), get_ptr(out), dim1, dim2)
+            elif from_order == "col_ampere":
+                lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
+        else:
+            raise NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}')
 
-# 8 bits functions
-def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
-        if A.device == "cuda":
-            return cuda_double_quant(A=A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
+        post_call(prev_device)
+
+        return out, new_state
+    @classmethod
+    def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+        shapeA = SA[0]
+        shapeB = SB[0]
+        dimsA = len(shapeA)
+        dimsB = len(shapeB)
+        assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
+        if dimsA == 2:
+            m = shapeA[0]
+        elif dimsA == 3:
+            m = shapeA[0] * shapeA[1]
+
+        rows = n = shapeB[0]
+        assert prod(list(shapeA)) > 0, f'Input tensor dimensions need to be > 0: {shapeA}'
+
+        # if the tensor is empty, return a transformed empty tensor with the right dimensions
+        if shapeA[0] == 0 and dimsA == 2:
+            return torch.empty((0, shapeB[0]), device=A.device, dtype=torch.float16)
+        elif shapeA[1] == 0 and dimsA == 3:
+            return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
+
+        if dimsA == 2 and out is None:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
+            )
+        elif dimsA == 3 and out is None:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
+            )
+
+        assert dimsB != 3, "len(B.shape)==3 not supported"
+        assert A.device.type == "cuda"
+        assert B.device.type == "cuda"
+        assert A.dtype == torch.int8
+        assert B.dtype == torch.int8
+        assert out.dtype == dtype
+        assert SA[1] == "col32"
+        assert SB[1] in ["col_turing", "col_ampere"]
+        assert Sout[1] == "col32"
+        assert (
+            shapeA[-1] == shapeB[-1]
+        ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
+        formatB = SB[1]
+        prev_device = A.device
+        torch.cuda.set_device(A.device)
+
+        ptr = CUBLAS_Context.get_instance().get_context(A.device)
+        ptrA = get_ptr(A)
+        ptrB = get_ptr(B)
+        ptrC = get_ptr(out)
+
+        k = shapeA[-1]
+        lda = ct.c_int32(m * 32)
+        if formatB == "col_turing":
+            # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
+            # n = rows
+            ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
         else:
-            raise RuntimeError("double_quant is not supported on non-CUDA devices")
+            # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
+            # n = rows
+            ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
+
+        ldc = ct.c_int32(m * 32)
+        m = ct.c_int32(m)
+        n = ct.c_int32(n)
+        k = ct.c_int32(k)
+
+        has_error = 0
+        ptrRowScale = get_ptr(None)
+        is_on_gpu([A, B, out])
+        if formatB == 'col_turing':
+            if dtype == torch.int32:
+                has_error = lib.cigemmlt_turing_32(
+                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+                )
+            else:
+                has_error = lib.cigemmlt_turing_8(
+                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+                )
+        elif formatB == "col_ampere":
+            if dtype == torch.int32:
+                has_error = lib.cigemmlt_ampere_32(
+                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+                )
+            else:
+                has_error = lib.cigemmlt_ampere_8(
+                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+                )
 
-def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
-        if A.device == "cuda":
-            return cuda_transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
+        if has_error == 1:
+            print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}')
+            raise Exception('cublasLt ran into an error!')
+
+        torch.cuda.set_device(prev_device)
+
+        return out, Sout
+    @classmethod
+    def mm_dequant(
+        A,
+        quant_state,
+        row_stats,
+        col_stats,
+        out=None,
+        new_row_stats=None,
+        new_col_stats=None,
+        bias=None
+    ):
+        assert A.dtype == torch.int32
+        if bias is not None: assert bias.dtype == torch.float16
+        out_shape = quant_state[0]
+        if len(out_shape) == 3:
+            out_shape = (out_shape[0] * out_shape[1], out_shape[2])
+
+        if out is None:
+            out = torch.empty(out_shape, dtype=torch.float16, device=A.device)
+        if new_row_stats is None:
+            new_row_stats = torch.empty(
+                out_shape[0], dtype=torch.float32, device=A.device
+            )
+        if new_col_stats is None:
+            new_col_stats = torch.empty(
+                out_shape[1], dtype=torch.float32, device=A.device
+            )
+        assert (
+            new_row_stats.shape[0] == row_stats.shape[0]
+        ), f"{new_row_stats.shape} vs {row_stats.shape}"
+        assert (
+            new_col_stats.shape[0] == col_stats.shape[0]
+        ), f"{new_col_stats.shape} vs {col_stats.shape}"
+
+        prev_device = pre_call(A.device)
+        ptrA = get_ptr(A)
+        ptrOut = get_ptr(out)
+        ptrRowStats = get_ptr(row_stats)
+        ptrColStats = get_ptr(col_stats)
+        ptrNewRowStats = get_ptr(new_row_stats)
+        ptrNewColStats = get_ptr(new_col_stats)
+        ptrBias = get_ptr(bias)
+        numRows = ct.c_int32(out_shape[0])
+        numCols = ct.c_int32(out_shape[1])
+
+        is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias])
+        lib.cdequant_mm_int32_fp16(ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols)
+        post_call(prev_device)
+
+        return out
+    @classmethod
+    def extract_outliers(A, SA, idx):
+        shapeA = SA[0]
+        formatA = SA[1]
+        assert formatA in ["col_turing", "col_ampere"]
+        assert A.device.type == "cuda"
+
+        out = torch.zeros(
+            (shapeA[0], idx.numel()), dtype=torch.int8, device=A.device
+        )
+
+        idx_size = ct.c_int32(idx.numel())
+        rows = ct.c_int32(shapeA[0])
+        cols = ct.c_int32(shapeA[1])
+        ptrA = get_ptr(A)
+        ptrIdx = get_ptr(idx)
+        ptrOut = get_ptr(out)
+
+        prev_device = pre_call(A.device)
+        if formatA == 'col_turing':
+            lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
+        elif formatA == "col_ampere":
+            lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
+        post_call(prev_device)
+
+        return out
+    @classmethod
+    def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
+        """
+        Quantize tensor A in blocks of 4-bit values.
+
+        Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
+
+        Parameters
+        ----------
+        A : torch.Tensor
+            The input tensor.
+        absmax : torch.Tensor
+            The absmax values.
+        out : torch.Tensor
+            The output tensor (8-bit).
+        blocksize : int
+            The blocksize used in quantization.
+        quant_type : str
+            The 4-bit quantization data type {fp4, nf4}
+
+        Returns
+        -------
+        torch.Tensor:
+            The 8-bit tensor with packed 4-bit values.
+        tuple(torch.Tensor, torch.Size, torch.dtype, int):
+            The quantization state to undo the quantization.
+        """
+        if A.device.type != 'cuda':
+            raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
+        if quant_type not in ['fp4', 'nf4']:
+            raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
+
+        n = A.numel()
+        input_shape = A.shape
+
+        if absmax is None:
+            blocks = n // blocksize
+            blocks += 1 if n % blocksize > 0 else 0
+            absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
+
+
+        if out is None:
+            out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device)
+
+        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+
+        prev_device = pre_call(A.device)
+        is_on_gpu([A, out, absmax])
+
+        if A.dtype == torch.float32:
+            if quant_type == 'fp4':
+                lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+            else:
+                lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+        elif A.dtype == torch.float16:
+            if quant_type == 'fp4':
+                lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+            else:
+                lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+        elif A.dtype == torch.bfloat16:
+            if quant_type == 'fp4':
+                lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+            else:
+                lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
         else:
-            raise RuntimeError("transform on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+        post_call(A.device)
 
-def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
-        if A.device == "cuda":
-            return cuda_igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
+        code = get_4bit_type(quant_type, device=A.device)
+
+        if compress_statistics:
+            offset = absmax.mean()
+            absmax -= offset
+            qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
+            del absmax
+            state = QuantState(absmax=qabsmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, offset=offset, state2=state2)
         else:
-            raise RuntimeError("igemmlt on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
+            state = QuantState(absmax=absmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, )
+
+        return out, state
+    @classmethod
+    def dequantize_4bit(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
+        """
+        Dequantizes FP4 blockwise quantized values.
+
+        Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
+
+        Parameters
+        ----------
+        A : torch.Tensor
+            The input 8-bit tensor (packed 4-bit values).
+        quant_state : QuantState
+            object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
+        absmax : torch.Tensor
+            The absmax values.
+        out : torch.Tensor
+            Dequantized output tensor.
+        blocksize : int
+            The blocksize used in quantization.
+        quant_type : str
+            The 4-bit quantization data type {fp4, nf4}
+
+
+        Returns
+        -------
+        torch.Tensor:
+            Dequantized tensor.
+        """
+        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
+            raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
+        if quant_type not in ['fp4', 'nf4']:
+            raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
+
+        if quant_state is None:
+            assert absmax is not None and out is not None
+
+            quant_state = QuantState(absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type)
+
+        else:
+            absmax = quant_state.absmax
+
+
+        if quant_state.nested:
+            absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
+            absmax += quant_state.offset
+            if absmax.dtype != torch.float32: absmax = absmax.float()
+
+        if out is None:
+            out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
+
+        n = out.numel()
+
+        device = pre_call(A.device)
+        is_on_gpu([A, absmax, out])
+        if out.dtype == torch.float32:
+            if quant_state.quant_type == 'fp4':
+                lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+            else:
+                lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+        elif out.dtype == torch.float16:
+            if quant_state.quant_type == 'fp4':
+                lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+            else:
+                lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+        elif out.dtype == torch.bfloat16:
+            if quant_state.quant_type == 'fp4':
+                lib.cdequantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+            else:
+                lib.cdequantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+        else:
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+        post_call(A.device)
+
+        is_transposed = (True if A.shape[0] == 1 else False)
+        if is_transposed: return out.t()
+        else: return out
+
+
+Backends.register_backend("cuda", CUDABackend)
+
+# 8 bits common functions
+def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
+    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
+    return Backends.device[A.device].double_quant(A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
+
+def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
+    return Backends.device[A.device].transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
+
+def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
+    return Backends.device[A.device].igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
 
 def mm_dequant(
     A,
@@ -2566,26 +2500,18 @@ def mm_dequant(
     new_col_stats=None,
     bias=None
 ):
-        if A.device == "cuda":
-            cuda_mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
-        else:
-            raise RuntimeError("mm_dequant on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
+    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
+    return Backends.device[A.device].mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
 
 def extract_outliers(A, SA, idx):
-        if A.device == "cuda":
-            return cuda_extract_outliers(A, SA, idx)
-        else:
-            raise RuntimeError("extract_outliers on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
+    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
+    return Backends.device[A.device].extract_outliers(A, SA, idx)
 
-# 4 bits functions
-def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
-        if A.device == "cuda":
-            return cuda_quantize_4bit(A, absmax = absmax, out = out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
-        else:
-            raise RuntimeError("quantize_4bit on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
+# 4 bits common functions
+def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4'):
+    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
+    return Backends.device[A.device].quantize_4bit(A, absmax = absmax, out = out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
 
-def dequantize_4bit(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
-        if A.device == "cuda":
-            return cuda_dequantize_4bit(A, quant_state = quant_state, absmax = absmax, out = out, blocksize = blocksize, quant_type=quant_type)
-        else:
-            raise RuntimeError("dequantize_4bit on non-CUDA devices (CPU, XPU...) will be soon supported but not yet, aborting...")
\ No newline at end of file
+def dequantize_4bit(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4'):
+    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
+    return Backends.device[A.device].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
\ No newline at end of file

From b2a4d54e398a147b6e3ba797934bc023595b8d7a Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Sun, 3 Dec 2023 19:54:15 -0800
Subject: [PATCH 023/234] add quant to device when init weight paam

---
 bitsandbytes/backends.py   | 11 +++------
 bitsandbytes/nn/modules.py | 50 +++++++-------------------------------
 2 files changed, 13 insertions(+), 48 deletions(-)

diff --git a/bitsandbytes/backends.py b/bitsandbytes/backends.py
index eb7bda484..69c2c458c 100644
--- a/bitsandbytes/backends.py
+++ b/bitsandbytes/backends.py
@@ -18,21 +18,18 @@ def register_backend(backend_name: str, backend_class):
             "xpu",
         }, "register device backend choices in [cpu, cuda, xpu]"
 
-        # check 8bits or 4bits functionality, at least one is compelete
+        # check 8bits and 4bits interfaces
         if (
             hasattr(backend_class, "double_quant")
             and hasattr(backend_class, "transform")
             and hasattr(backend_class, "igemmlt")
             and hasattr(backend_class, "mm_dequant")
             and hasattr(backend_class, "extract_outliers")
+            and hasattr(backend_class, "quantize_4bit")
+            and hasattr(backend_class, "dequantize_4bit")
         ):
             self.devices[backend_name.lower()] = backend_class
-
-        elif hasattr(backend_class, "quantize_4bit") and hasattr(
-            backend_class, "dequantize_4bit"
-        ):
-            self.devices[backend_name.lower()] = backend_classq
         else:
             assert (
                 False
-            ), f"register device backend {backend_name.lower()} but its functionality is not compelete"
+            ), f"register device backend {backend_name.lower()} but its interfaces are not compelete"
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index d9a0e7434..5de83e867 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -163,16 +163,8 @@ def from_prequantized(cls, data: torch.Tensor, quantized_stats: Dict[str, Any],
         self.compress_statistics = self.quant_state.nested
         self.quant_type = self.quant_state.quant_type
 
-    def cpu(self, device):
-        warnings.warn("CPU Params4bit will be soon supported, return raw Params4bit for now")
-        return self
-
-    def xpu(self, device):
-        warnings.warn("XPU Params4bit will be soon supported, return raw Params4bit for now")
-        return self
-
-    def cuda(self, device):
-        w = self.data.contiguous().half().cuda(device)
+    def quantize_to_device(self, device):
+        w = self.data.contiguous().half().to(device)
         w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
         self.data = w_4bit
         self.quant_state = quant_state
@@ -194,14 +186,8 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T:
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
     
-        if device is not None and device.type == "cpu":
-            return self.cpu(device)
-
-        if (device is not None and device.type != "cpu" and self.data.device.type == "cpu"):
-            if device.type == "cuda":
-                return self.cuda(device)
-            elif device.type == "xpu":
-                return self.xpu(device)
+        if (device is not None and self.data.device.type == "cpu"):
+            return self.quantize_to_device(device)
         else:
             if self.quant_state is not None:
                 self.quant_state.to(device)
@@ -309,25 +295,13 @@ def __new__(
             data = torch.empty(0)
         return torch.Tensor._make_subclass(cls, data, requires_grad)
 
-
-    def cpu(self, device):
-        warnings.warn("CPU Int8Params will be soon supported, return raw Int8Params for now")
-
-        return self
-
-
-    def xpu(self, device):
-        warnings.warn("XPU Int8Params will be soon supported, return raw Int8Params for now")
-
-        return self
-
-    def cuda(self, device):
+    def quantize_to_device(self, device):
         if self.has_fp16_weights:
-            return super().cuda(device)
+            return super().to(device)
         else:
             # we store the 8-bit rows-major weight
             # we convert this weight to the turning/ampere weight during the first inference pass
-            B = self.data.contiguous().half().cuda(device)
+            B = self.data.contiguous().half().to(device)
             CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
             del CBt
             del SCBt
@@ -359,14 +333,8 @@ def to(self, *args, **kwargs):
             *args, **kwargs
         )
 
-        if device is not None and device.type == "cpu":
-            return self.cpu(device)
-
-        if (device is not None and device.type != "cpu" and self.data.device.type == "cpu"):
-            if device.type == "cuda":
-                return self.cuda(device)
-            elif device.type == "xpu":
-                return self.xpu(device)
+        if (device is not None and self.data.device.type == "cpu"):
+            return self.quantize_to_device(device)
         else:
             new_param = Int8Params(
                 super().to(

From c44cf065cc38c898c048ab461c3162879a815406 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Sun, 3 Dec 2023 22:15:15 -0800
Subject: [PATCH 024/234] minor fix

---
 bitsandbytes/functional.py | 44 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index c30e1b651..95295d6a1 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -933,6 +933,50 @@ def dequantize(
     out = dequantize_no_absmax(A, state[1], out)
     return out * state[0]
 
+def get_colrow_absmax(
+    A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0
+):
+    assert A.dtype == torch.float16
+    device = A.device
+
+    cols = A.shape[-1]
+    if len(A.shape) == 3:
+        rows = A.shape[0] * A.shape[1]
+    else:
+        rows = A.shape[0]
+
+    col_tiles = (cols + 255) // 256
+    tiled_rows = ((rows + 15) // 16) * 16
+    if row_stats is None:
+        row_stats = torch.empty(
+            (rows,), dtype=torch.float32, device=device
+        ).fill_(-50000.0)
+    if col_stats is None:
+        col_stats = torch.empty(
+            (cols,), dtype=torch.float32, device=device
+        ).fill_(-50000.0)
+
+    if nnz_block_ptr is None and threshold > 0.0:
+        nnz_block_ptr = torch.zeros(
+            ((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device
+        )
+
+    ptrA = get_ptr(A)
+    ptrRowStats = get_ptr(row_stats)
+    ptrColStats = get_ptr(col_stats)
+    ptrNnzrows = get_ptr(nnz_block_ptr)
+    rows = ct.c_int32(rows)
+    cols = ct.c_int32(cols)
+
+    prev_device = pre_call(A.device)
+    is_on_gpu([A, row_stats, col_stats, nnz_block_ptr])
+    lib.cget_col_row_stats(ptrA, ptrRowStats, ptrColStats, ptrNnzrows, ct.c_float(threshold), rows, cols)
+    post_call(prev_device)
+
+    if threshold > 0.0:
+        nnz_block_ptr.cumsum_(0)
+
+    return row_stats, col_stats, nnz_block_ptr
 
 def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
     '''

From 365491a573e340f4c589694537c8b463290375a4 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Mon, 4 Dec 2023 00:41:57 -0800
Subject: [PATCH 025/234] mv cuda to common backends

---
 bitsandbytes/backends.py   | 663 ++++++++++++++++++++++++++++++++++++-
 bitsandbytes/functional.py | 564 +++----------------------------
 2 files changed, 706 insertions(+), 521 deletions(-)

diff --git a/bitsandbytes/backends.py b/bitsandbytes/backends.py
index 69c2c458c..e2899e840 100644
--- a/bitsandbytes/backends.py
+++ b/bitsandbytes/backends.py
@@ -1,3 +1,21 @@
+import torch
+from torch import Tensor
+from bitsandbytes.functional import (
+    pre_call,
+    post_call,
+    get_colrow_absmax,
+    get_ptr,
+    is_on_gpu,
+    coo_zeros,
+    get_transform_buffer,
+    prod,
+    get_4bit_type,
+    quantize_blockwise,
+    dequantize_blockwise,
+)
+from bitsandbytes.functional import CUBLAS_Context, QuantState
+
+
 class Backends:
     """
     An dict class for device backends that registered with 8bits and 4bits functions.
@@ -7,11 +25,10 @@ class Backends:
 
     """
 
-    def __init__(self):
-        self.devices = {}
+    devices = {}
 
     @classmethod
-    def register_backend(backend_name: str, backend_class):
+    def register_backend(self, backend_name: str, backend_class):
         assert backend_name.lower() in {
             "cpu",
             "cuda",
@@ -28,8 +45,646 @@ def register_backend(backend_name: str, backend_class):
             and hasattr(backend_class, "quantize_4bit")
             and hasattr(backend_class, "dequantize_4bit")
         ):
-            self.devices[backend_name.lower()] = backend_class
+            Backends.devices[backend_name.lower()] = backend_class
         else:
             assert (
                 False
             ), f"register device backend {backend_name.lower()} but its interfaces are not compelete"
+
+
+class CUDABackend:
+    @classmethod
+    def double_quant(
+        A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
+    ):
+        device = A.device
+        assert A.dtype == torch.half
+        assert device.type == "cuda"
+        prev_device = pre_call(A.device)
+
+        cols = A.shape[-1]
+        if len(A.shape) == 3:
+            rows = A.shape[0] * A.shape[1]
+        else:
+            rows = A.shape[0]
+
+        if row_stats is None or col_stats is None:
+            row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(
+                A, threshold=threshold
+            )
+
+        if out_col is None:
+            out_col = torch.zeros(A.shape, device=device, dtype=torch.int8)
+        if out_row is None:
+            out_row = torch.zeros(A.shape, device=device, dtype=torch.int8)
+
+        coo_tensor = None
+        ptrA = get_ptr(A)
+        ptrColStats = get_ptr(col_stats)
+        ptrRowStats = get_ptr(row_stats)
+        ptrOutCol = get_ptr(out_col)
+        ptrOutRow = get_ptr(out_row)
+
+        is_on_gpu([A, col_stats, row_stats, out_col, out_row])
+        if threshold > 0.0:
+            nnz = nnz_row_ptr[-1].item()
+            if nnz > 0:
+                coo_tensor = coo_zeros(
+                    A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device
+                )
+                ptrRowIdx = get_ptr(coo_tensor.rowidx)
+                ptrColIdx = get_ptr(coo_tensor.colidx)
+                ptrVal = get_ptr(coo_tensor.values)
+                ptrRowPtr = get_ptr(nnz_row_ptr)
+
+                lib.cdouble_rowcol_quant(
+                    ptrA,
+                    ptrRowStats,
+                    ptrColStats,
+                    ptrOutCol,
+                    ptrOutRow,
+                    ptrRowIdx,
+                    ptrColIdx,
+                    ptrVal,
+                    ptrRowPtr,
+                    ct.c_float(threshold),
+                    ct.c_int32(rows),
+                    ct.c_int32(cols),
+                )
+                val, idx = torch.sort(coo_tensor.rowidx)
+                coo_tensor.rowidx = val
+                coo_tensor.colidx = coo_tensor.colidx[idx]
+                coo_tensor.values = coo_tensor.values[idx]
+            else:
+                lib.cdouble_rowcol_quant(
+                    ptrA,
+                    ptrRowStats,
+                    ptrColStats,
+                    ptrOutCol,
+                    ptrOutRow,
+                    None,
+                    None,
+                    None,
+                    None,
+                    ct.c_float(0.0),
+                    ct.c_int32(rows),
+                    ct.c_int32(cols),
+                )
+        else:
+            lib.cdouble_rowcol_quant(
+                ptrA,
+                ptrRowStats,
+                ptrColStats,
+                ptrOutCol,
+                ptrOutRow,
+                None,
+                None,
+                None,
+                None,
+                ct.c_float(threshold),
+                ct.c_int32(rows),
+                ct.c_int32(cols),
+            )
+        post_call(prev_device)
+
+        return out_row, out_col, row_stats, col_stats, coo_tensor
+
+    @classmethod
+    def transform(
+        A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None
+    ):
+        prev_device = pre_call(A.device)
+        if state is None:
+            state = (A.shape, from_order)
+        else:
+            from_order = state[1]
+        if out is None:
+            out, new_state = get_transform_buffer(
+                state[0], A.dtype, A.device, to_order, state[1], transpose
+            )
+        else:
+            new_state = (state[0], to_order)  # (shape, order)
+
+        shape = state[0]
+        if len(shape) == 2:
+            dim1 = ct.c_int32(shape[0])
+            dim2 = ct.c_int32(shape[1])
+        else:
+            dim1 = ct.c_int32(shape[0] * shape[1])
+            dim2 = ct.c_int32(shape[2])
+
+        is_on_gpu([A, out])
+        if to_order == "col32":
+            if transpose:
+                lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
+            else:
+                lib.ctransform_row2col32(get_ptr(A), get_ptr(out), dim1, dim2)
+        elif to_order == "col_turing":
+            if transpose:
+                lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2)
+            else:
+                lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2)
+        elif to_order == "col_ampere":
+            if transpose:
+                lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2)
+            else:
+                lib.ctransform_row2ampere(get_ptr(A), get_ptr(out), dim1, dim2)
+        elif to_order == "row":
+            if from_order == "col_turing":
+                lib.ctransform_turing2row(get_ptr(A), get_ptr(out), dim1, dim2)
+            elif from_order == "col_ampere":
+                lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
+        else:
+            raise NotImplementedError(
+                f"Transform function not implemented: From {from_order} to {to_order}"
+            )
+
+        post_call(prev_device)
+
+        return out, new_state
+
+    @classmethod
+    def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+        shapeA = SA[0]
+        shapeB = SB[0]
+        dimsA = len(shapeA)
+        dimsB = len(shapeB)
+        assert dimsB == 2, "Only two dimensional matrices are supported for argument B"
+        if dimsA == 2:
+            m = shapeA[0]
+        elif dimsA == 3:
+            m = shapeA[0] * shapeA[1]
+
+        rows = n = shapeB[0]
+        assert (
+            prod(list(shapeA)) > 0
+        ), f"Input tensor dimensions need to be > 0: {shapeA}"
+
+        # if the tensor is empty, return a transformed empty tensor with the right dimensions
+        if shapeA[0] == 0 and dimsA == 2:
+            return torch.empty((0, shapeB[0]), device=A.device, dtype=torch.float16)
+        elif shapeA[1] == 0 and dimsA == 3:
+            return torch.empty(
+                tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16
+            )
+
+        if dimsA == 2 and out is None:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
+            )
+        elif dimsA == 3 and out is None:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
+            )
+
+        assert dimsB != 3, "len(B.shape)==3 not supported"
+        assert A.device.type == "cuda"
+        assert B.device.type == "cuda"
+        assert A.dtype == torch.int8
+        assert B.dtype == torch.int8
+        assert out.dtype == dtype
+        assert SA[1] == "col32"
+        assert SB[1] in ["col_turing", "col_ampere"]
+        assert Sout[1] == "col32"
+        assert (
+            shapeA[-1] == shapeB[-1]
+        ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
+        formatB = SB[1]
+        prev_device = A.device
+        torch.cuda.set_device(A.device)
+
+        ptr = CUBLAS_Context.get_instance().get_context(A.device)
+        ptrA = get_ptr(A)
+        ptrB = get_ptr(B)
+        ptrC = get_ptr(out)
+
+        k = shapeA[-1]
+        lda = ct.c_int32(m * 32)
+        if formatB == "col_turing":
+            # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
+            # n = rows
+            ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+        else:
+            # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
+            # n = rows
+            ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
+
+        ldc = ct.c_int32(m * 32)
+        m = ct.c_int32(m)
+        n = ct.c_int32(n)
+        k = ct.c_int32(k)
+
+        has_error = 0
+        ptrRowScale = get_ptr(None)
+        is_on_gpu([A, B, out])
+        if formatB == "col_turing":
+            if dtype == torch.int32:
+                has_error = lib.cigemmlt_turing_32(
+                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+                )
+            else:
+                has_error = lib.cigemmlt_turing_8(
+                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+                )
+        elif formatB == "col_ampere":
+            if dtype == torch.int32:
+                has_error = lib.cigemmlt_ampere_32(
+                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+                )
+            else:
+                has_error = lib.cigemmlt_ampere_8(
+                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
+                )
+
+        if has_error == 1:
+            print(
+                f"A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}"
+            )
+            raise Exception("cublasLt ran into an error!")
+
+        torch.cuda.set_device(prev_device)
+
+        return out, Sout
+
+    @classmethod
+    def mm_dequant(
+        A,
+        quant_state,
+        row_stats,
+        col_stats,
+        out=None,
+        new_row_stats=None,
+        new_col_stats=None,
+        bias=None,
+    ):
+        assert A.dtype == torch.int32
+        if bias is not None:
+            assert bias.dtype == torch.float16
+        out_shape = quant_state[0]
+        if len(out_shape) == 3:
+            out_shape = (out_shape[0] * out_shape[1], out_shape[2])
+
+        if out is None:
+            out = torch.empty(out_shape, dtype=torch.float16, device=A.device)
+        if new_row_stats is None:
+            new_row_stats = torch.empty(
+                out_shape[0], dtype=torch.float32, device=A.device
+            )
+        if new_col_stats is None:
+            new_col_stats = torch.empty(
+                out_shape[1], dtype=torch.float32, device=A.device
+            )
+        assert (
+            new_row_stats.shape[0] == row_stats.shape[0]
+        ), f"{new_row_stats.shape} vs {row_stats.shape}"
+        assert (
+            new_col_stats.shape[0] == col_stats.shape[0]
+        ), f"{new_col_stats.shape} vs {col_stats.shape}"
+
+        prev_device = pre_call(A.device)
+        ptrA = get_ptr(A)
+        ptrOut = get_ptr(out)
+        ptrRowStats = get_ptr(row_stats)
+        ptrColStats = get_ptr(col_stats)
+        ptrNewRowStats = get_ptr(new_row_stats)
+        ptrNewColStats = get_ptr(new_col_stats)
+        ptrBias = get_ptr(bias)
+        numRows = ct.c_int32(out_shape[0])
+        numCols = ct.c_int32(out_shape[1])
+
+        is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias])
+        lib.cdequant_mm_int32_fp16(
+            ptrA,
+            ptrRowStats,
+            ptrColStats,
+            ptrOut,
+            ptrNewRowStats,
+            ptrNewColStats,
+            ptrBias,
+            numRows,
+            numCols,
+        )
+        post_call(prev_device)
+
+        return out
+
+    @classmethod
+    def extract_outliers(A, SA, idx):
+        shapeA = SA[0]
+        formatA = SA[1]
+        assert formatA in ["col_turing", "col_ampere"]
+        assert A.device.type == "cuda"
+
+        out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device)
+
+        idx_size = ct.c_int32(idx.numel())
+        rows = ct.c_int32(shapeA[0])
+        cols = ct.c_int32(shapeA[1])
+        ptrA = get_ptr(A)
+        ptrIdx = get_ptr(idx)
+        ptrOut = get_ptr(out)
+
+        prev_device = pre_call(A.device)
+        if formatA == "col_turing":
+            lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
+        elif formatA == "col_ampere":
+            lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
+        post_call(prev_device)
+
+        return out
+
+    @classmethod
+    def quantize_4bit(
+        A: Tensor,
+        absmax: Tensor = None,
+        out: Tensor = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type="fp4",
+    ) -> Tensor:
+        """
+        Quantize tensor A in blocks of 4-bit values.
+
+        Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
+
+        Parameters
+        ----------
+        A : torch.Tensor
+            The input tensor.
+        absmax : torch.Tensor
+            The absmax values.
+        out : torch.Tensor
+            The output tensor (8-bit).
+        blocksize : int
+            The blocksize used in quantization.
+        quant_type : str
+            The 4-bit quantization data type {fp4, nf4}
+
+        Returns
+        -------
+        torch.Tensor:
+            The 8-bit tensor with packed 4-bit values.
+        tuple(torch.Tensor, torch.Size, torch.dtype, int):
+            The quantization state to undo the quantization.
+        """
+        if A.device.type != "cuda":
+            raise NotImplementedError(
+                f"Device type not supported for FP4 quantization: {A.device.type}"
+            )
+        if quant_type not in ["fp4", "nf4"]:
+            raise NotImplementedError(
+                f"4-bit quantization data type {quant_type} is not implemented."
+            )
+
+        n = A.numel()
+        input_shape = A.shape
+
+        if absmax is None:
+            blocks = n // blocksize
+            blocks += 1 if n % blocksize > 0 else 0
+            absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
+
+        if out is None:
+            out = torch.zeros(((n + 1) // 2, 1), dtype=torch.uint8, device=A.device)
+
+        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+
+        prev_device = pre_call(A.device)
+        is_on_gpu([A, out, absmax])
+
+        if A.dtype == torch.float32:
+            if quant_type == "fp4":
+                lib.cquantize_blockwise_fp32_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
+            else:
+                lib.cquantize_blockwise_fp32_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
+        elif A.dtype == torch.float16:
+            if quant_type == "fp4":
+                lib.cquantize_blockwise_fp16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
+            else:
+                lib.cquantize_blockwise_fp16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
+        elif A.dtype == torch.bfloat16:
+            if quant_type == "fp4":
+                lib.cquantize_blockwise_bf16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
+            else:
+                lib.cquantize_blockwise_bf16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
+        else:
+            raise ValueError(
+                f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}"
+            )
+        post_call(A.device)
+
+        code = get_4bit_type(quant_type, device=A.device)
+
+        if compress_statistics:
+            offset = absmax.mean()
+            absmax -= offset
+            qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
+            del absmax
+            state = QuantState(
+                absmax=qabsmax,
+                shape=input_shape,
+                dtype=A.dtype,
+                blocksize=blocksize,
+                code=code,
+                quant_type=quant_type,
+                offset=offset,
+                state2=state2,
+            )
+        else:
+            state = QuantState(
+                absmax=absmax,
+                shape=input_shape,
+                dtype=A.dtype,
+                blocksize=blocksize,
+                code=code,
+                quant_type=quant_type,
+            )
+
+        return out, state
+
+    @classmethod
+    def dequantize_4bit(
+        A: Tensor,
+        quant_state: QuantState = None,
+        absmax: Tensor = None,
+        out: Tensor = None,
+        blocksize: int = 64,
+        quant_type="fp4",
+    ) -> Tensor:
+        """
+        Dequantizes FP4 blockwise quantized values.
+
+        Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
+
+        Parameters
+        ----------
+        A : torch.Tensor
+            The input 8-bit tensor (packed 4-bit values).
+        quant_state : QuantState
+            object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
+        absmax : torch.Tensor
+            The absmax values.
+        out : torch.Tensor
+            Dequantized output tensor.
+        blocksize : int
+            The blocksize used in quantization.
+        quant_type : str
+            The 4-bit quantization data type {fp4, nf4}
+
+
+        Returns
+        -------
+        torch.Tensor:
+            Dequantized tensor.
+        """
+        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
+            raise ValueError(
+                f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]"
+            )
+        if quant_type not in ["fp4", "nf4"]:
+            raise NotImplementedError(
+                f"4-bit quantization data type {quant_type} is not implemented."
+            )
+
+        if quant_state is None:
+            assert absmax is not None and out is not None
+
+            quant_state = QuantState(
+                absmax=absmax,
+                shape=out.shape,
+                dtype=out.dtype,
+                blocksize=blocksize,
+                quant_type=quant_type,
+            )
+
+        else:
+            absmax = quant_state.absmax
+
+        if quant_state.nested:
+            absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
+            absmax += quant_state.offset
+            if absmax.dtype != torch.float32:
+                absmax = absmax.float()
+
+        if out is None:
+            out = torch.empty(
+                quant_state.shape, dtype=quant_state.dtype, device=A.device
+            )
+
+        n = out.numel()
+
+        device = pre_call(A.device)
+        is_on_gpu([A, absmax, out])
+        if out.dtype == torch.float32:
+            if quant_state.quant_type == "fp4":
+                lib.cdequantize_blockwise_fp32_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
+            else:
+                lib.cdequantize_blockwise_fp32_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
+        elif out.dtype == torch.float16:
+            if quant_state.quant_type == "fp4":
+                lib.cdequantize_blockwise_fp16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
+            else:
+                lib.cdequantize_blockwise_fp16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
+        elif out.dtype == torch.bfloat16:
+            if quant_state.quant_type == "fp4":
+                lib.cdequantize_blockwise_bf16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
+            else:
+                lib.cdequantize_blockwise_bf16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
+        else:
+            raise ValueError(
+                f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}"
+            )
+        post_call(A.device)
+
+        is_transposed = True if A.shape[0] == 1 else False
+        if is_transposed:
+            return out.t()
+        else:
+            return out
+
+
+Backends.register_backend("cuda", CUDABackend)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 95295d6a1..178fa8614 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -19,7 +19,6 @@
 from warnings import warn
 from .cextension import COMPILED_WITH_CUDA
 from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
-from backends import Backends
 
 # CUDA specific lib
 if COMPILED_WITH_CUDA:
@@ -887,52 +886,6 @@ def get_4bit_type(typename, device=None, blocksize=64):
 
     return data.to(device)
 
-
-def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
-    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4')
-
-def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
-    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4')
-
-def dequantize_fp4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
-    return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
-
-def dequantize_nf4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
-    return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')
-
-def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
-    if code is None:
-        if "dynamic" not in name2qmap:
-            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
-        code = name2qmap["dynamic"]
-        code = code.to(A.device)
-
-    absmax = torch.abs(A).max()
-    if absmax.dtype != torch.float32: absmax = absmax.float()
-    inp = A / absmax
-    out = quantize_no_absmax(inp, code, out)
-    return out, (absmax, code)
-
-
-def dequantize(
-    A: Tensor,
-    state: Tuple[Tensor, Tensor] = None,
-    absmax: Tensor = None,
-    code: Tensor = None,
-    out: Tensor = None,
-) -> Tensor:
-    assert state is not None or absmax is not None
-    if code is None and state is None:
-        if "dynamic" not in name2qmap:
-            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
-        code = name2qmap["dynamic"]
-        code = code.to(A.device)
-
-    if state is None:
-        state = (absmax, code)
-    out = dequantize_no_absmax(A, state[1], out)
-    return out * state[0]
-
 def get_colrow_absmax(
     A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0
 ):
@@ -1035,6 +988,39 @@ def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
     post_call(prev_device)
     return out
 
+def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
+    if code is None:
+        if "dynamic" not in name2qmap:
+            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+        code = name2qmap["dynamic"]
+        code = code.to(A.device)
+
+    absmax = torch.abs(A).max()
+    if absmax.dtype != torch.float32: absmax = absmax.float()
+    inp = A / absmax
+    out = quantize_no_absmax(inp, code, out)
+    return out, (absmax, code)
+
+
+def dequantize(
+    A: Tensor,
+    state: Tuple[Tensor, Tensor] = None,
+    absmax: Tensor = None,
+    code: Tensor = None,
+    out: Tensor = None,
+) -> Tensor:
+    assert state is not None or absmax is not None
+    if code is None and state is None:
+        if "dynamic" not in name2qmap:
+            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+        code = name2qmap["dynamic"]
+        code = code.to(A.device)
+
+    if state is None:
+        state = (absmax, code)
+    out = dequantize_no_absmax(A, state[1], out)
+    return out * state[0]
+
 
 def optimizer_update_32bit(
     optimizer_name: str,
@@ -2050,476 +2036,8 @@ def pipeline_test(A, batch_size):
     lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
     return out
 
-class CUDABackend:
-    @classmethod
-    def double_quant(
-        A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
-    ):
-        device = A.device
-        assert A.dtype == torch.half
-        assert device.type == "cuda"
-        prev_device = pre_call(A.device)
 
-        cols = A.shape[-1]
-        if len(A.shape) == 3:
-            rows = A.shape[0] * A.shape[1]
-        else:
-            rows = A.shape[0]
-
-        if row_stats is None or col_stats is None:
-            row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(
-                A, threshold=threshold
-            )
-
-        if out_col is None:
-            out_col = torch.zeros(A.shape, device=device, dtype=torch.int8)
-        if out_row is None:
-            out_row = torch.zeros(A.shape, device=device, dtype=torch.int8)
-
-        coo_tensor = None
-        ptrA = get_ptr(A)
-        ptrColStats = get_ptr(col_stats)
-        ptrRowStats = get_ptr(row_stats)
-        ptrOutCol = get_ptr(out_col)
-        ptrOutRow = get_ptr(out_row)
-
-        is_on_gpu([A, col_stats, row_stats, out_col, out_row])
-        if threshold > 0.0:
-            nnz = nnz_row_ptr[-1].item()
-            if nnz > 0:
-                coo_tensor = coo_zeros(
-                    A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device
-                )
-                ptrRowIdx = get_ptr(coo_tensor.rowidx)
-                ptrColIdx = get_ptr(coo_tensor.colidx)
-                ptrVal = get_ptr(coo_tensor.values)
-                ptrRowPtr = get_ptr(nnz_row_ptr)
-
-                lib.cdouble_rowcol_quant(
-                    ptrA,
-                    ptrRowStats,
-                    ptrColStats,
-                    ptrOutCol,
-                    ptrOutRow,
-                    ptrRowIdx,
-                    ptrColIdx,
-                    ptrVal,
-                    ptrRowPtr,
-                    ct.c_float(threshold),
-                    ct.c_int32(rows),
-                    ct.c_int32(cols),
-                )
-                val, idx = torch.sort(coo_tensor.rowidx)
-                coo_tensor.rowidx = val
-                coo_tensor.colidx = coo_tensor.colidx[idx]
-                coo_tensor.values = coo_tensor.values[idx]
-            else:
-                lib.cdouble_rowcol_quant(
-                    ptrA,
-                    ptrRowStats,
-                    ptrColStats,
-                    ptrOutCol,
-                    ptrOutRow,
-                    None,
-                    None,
-                    None,
-                    None,
-                    ct.c_float(0.0),
-                    ct.c_int32(rows),
-                    ct.c_int32(cols),
-                )
-        else:
-            lib.cdouble_rowcol_quant(
-                ptrA,
-                ptrRowStats,
-                ptrColStats,
-                ptrOutCol,
-                ptrOutRow,
-                None,
-                None,
-                None,
-                None,
-                ct.c_float(threshold),
-                ct.c_int32(rows),
-                ct.c_int32(cols),
-            )
-        post_call(prev_device)
-
-        return out_row, out_col, row_stats, col_stats, coo_tensor
-    @classmethod
-    def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
-        prev_device = pre_call(A.device)
-        if state is None: state = (A.shape, from_order)
-        else: from_order = state[1]
-        if out is None: out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
-        else: new_state = (state[0], to_order) # (shape, order)
-
-        shape = state[0]
-        if len(shape) == 2:
-            dim1 = ct.c_int32(shape[0])
-            dim2 = ct.c_int32(shape[1])
-        else:
-            dim1 = ct.c_int32(shape[0] * shape[1])
-            dim2 = ct.c_int32(shape[2])
-
-        is_on_gpu([A, out])
-        if to_order == 'col32':
-            if transpose:
-                lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
-            else:
-                lib.ctransform_row2col32(get_ptr(A), get_ptr(out), dim1, dim2)
-        elif to_order == "col_turing":
-            if transpose:
-                lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2)
-            else:
-                lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2)
-        elif to_order == "col_ampere":
-            if transpose:
-                lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2)
-            else:
-                lib.ctransform_row2ampere(get_ptr(A), get_ptr(out), dim1, dim2)
-        elif to_order == "row":
-            if from_order == "col_turing":
-                lib.ctransform_turing2row(get_ptr(A), get_ptr(out), dim1, dim2)
-            elif from_order == "col_ampere":
-                lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
-        else:
-            raise NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}')
-
-        post_call(prev_device)
-
-        return out, new_state
-    @classmethod
-    def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
-        shapeA = SA[0]
-        shapeB = SB[0]
-        dimsA = len(shapeA)
-        dimsB = len(shapeB)
-        assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
-        if dimsA == 2:
-            m = shapeA[0]
-        elif dimsA == 3:
-            m = shapeA[0] * shapeA[1]
-
-        rows = n = shapeB[0]
-        assert prod(list(shapeA)) > 0, f'Input tensor dimensions need to be > 0: {shapeA}'
-
-        # if the tensor is empty, return a transformed empty tensor with the right dimensions
-        if shapeA[0] == 0 and dimsA == 2:
-            return torch.empty((0, shapeB[0]), device=A.device, dtype=torch.float16)
-        elif shapeA[1] == 0 and dimsA == 3:
-            return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
-
-        if dimsA == 2 and out is None:
-            out, Sout = get_transform_buffer(
-                (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
-            )
-        elif dimsA == 3 and out is None:
-            out, Sout = get_transform_buffer(
-                (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
-            )
-
-        assert dimsB != 3, "len(B.shape)==3 not supported"
-        assert A.device.type == "cuda"
-        assert B.device.type == "cuda"
-        assert A.dtype == torch.int8
-        assert B.dtype == torch.int8
-        assert out.dtype == dtype
-        assert SA[1] == "col32"
-        assert SB[1] in ["col_turing", "col_ampere"]
-        assert Sout[1] == "col32"
-        assert (
-            shapeA[-1] == shapeB[-1]
-        ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
-        formatB = SB[1]
-        prev_device = A.device
-        torch.cuda.set_device(A.device)
-
-        ptr = CUBLAS_Context.get_instance().get_context(A.device)
-        ptrA = get_ptr(A)
-        ptrB = get_ptr(B)
-        ptrC = get_ptr(out)
-
-        k = shapeA[-1]
-        lda = ct.c_int32(m * 32)
-        if formatB == "col_turing":
-            # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
-            # n = rows
-            ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
-        else:
-            # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
-            # n = rows
-            ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
-
-        ldc = ct.c_int32(m * 32)
-        m = ct.c_int32(m)
-        n = ct.c_int32(n)
-        k = ct.c_int32(k)
-
-        has_error = 0
-        ptrRowScale = get_ptr(None)
-        is_on_gpu([A, B, out])
-        if formatB == 'col_turing':
-            if dtype == torch.int32:
-                has_error = lib.cigemmlt_turing_32(
-                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-                )
-            else:
-                has_error = lib.cigemmlt_turing_8(
-                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-                )
-        elif formatB == "col_ampere":
-            if dtype == torch.int32:
-                has_error = lib.cigemmlt_ampere_32(
-                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-                )
-            else:
-                has_error = lib.cigemmlt_ampere_8(
-                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-                )
-
-        if has_error == 1:
-            print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}')
-            raise Exception('cublasLt ran into an error!')
-
-        torch.cuda.set_device(prev_device)
-
-        return out, Sout
-    @classmethod
-    def mm_dequant(
-        A,
-        quant_state,
-        row_stats,
-        col_stats,
-        out=None,
-        new_row_stats=None,
-        new_col_stats=None,
-        bias=None
-    ):
-        assert A.dtype == torch.int32
-        if bias is not None: assert bias.dtype == torch.float16
-        out_shape = quant_state[0]
-        if len(out_shape) == 3:
-            out_shape = (out_shape[0] * out_shape[1], out_shape[2])
-
-        if out is None:
-            out = torch.empty(out_shape, dtype=torch.float16, device=A.device)
-        if new_row_stats is None:
-            new_row_stats = torch.empty(
-                out_shape[0], dtype=torch.float32, device=A.device
-            )
-        if new_col_stats is None:
-            new_col_stats = torch.empty(
-                out_shape[1], dtype=torch.float32, device=A.device
-            )
-        assert (
-            new_row_stats.shape[0] == row_stats.shape[0]
-        ), f"{new_row_stats.shape} vs {row_stats.shape}"
-        assert (
-            new_col_stats.shape[0] == col_stats.shape[0]
-        ), f"{new_col_stats.shape} vs {col_stats.shape}"
-
-        prev_device = pre_call(A.device)
-        ptrA = get_ptr(A)
-        ptrOut = get_ptr(out)
-        ptrRowStats = get_ptr(row_stats)
-        ptrColStats = get_ptr(col_stats)
-        ptrNewRowStats = get_ptr(new_row_stats)
-        ptrNewColStats = get_ptr(new_col_stats)
-        ptrBias = get_ptr(bias)
-        numRows = ct.c_int32(out_shape[0])
-        numCols = ct.c_int32(out_shape[1])
-
-        is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias])
-        lib.cdequant_mm_int32_fp16(ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols)
-        post_call(prev_device)
-
-        return out
-    @classmethod
-    def extract_outliers(A, SA, idx):
-        shapeA = SA[0]
-        formatA = SA[1]
-        assert formatA in ["col_turing", "col_ampere"]
-        assert A.device.type == "cuda"
-
-        out = torch.zeros(
-            (shapeA[0], idx.numel()), dtype=torch.int8, device=A.device
-        )
-
-        idx_size = ct.c_int32(idx.numel())
-        rows = ct.c_int32(shapeA[0])
-        cols = ct.c_int32(shapeA[1])
-        ptrA = get_ptr(A)
-        ptrIdx = get_ptr(idx)
-        ptrOut = get_ptr(out)
-
-        prev_device = pre_call(A.device)
-        if formatA == 'col_turing':
-            lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
-        elif formatA == "col_ampere":
-            lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
-        post_call(prev_device)
-
-        return out
-    @classmethod
-    def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
-        """
-        Quantize tensor A in blocks of 4-bit values.
-
-        Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
-
-        Parameters
-        ----------
-        A : torch.Tensor
-            The input tensor.
-        absmax : torch.Tensor
-            The absmax values.
-        out : torch.Tensor
-            The output tensor (8-bit).
-        blocksize : int
-            The blocksize used in quantization.
-        quant_type : str
-            The 4-bit quantization data type {fp4, nf4}
-
-        Returns
-        -------
-        torch.Tensor:
-            The 8-bit tensor with packed 4-bit values.
-        tuple(torch.Tensor, torch.Size, torch.dtype, int):
-            The quantization state to undo the quantization.
-        """
-        if A.device.type != 'cuda':
-            raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
-        if quant_type not in ['fp4', 'nf4']:
-            raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
-
-        n = A.numel()
-        input_shape = A.shape
-
-        if absmax is None:
-            blocks = n // blocksize
-            blocks += 1 if n % blocksize > 0 else 0
-            absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
-
-
-        if out is None:
-            out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device)
-
-        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
-
-        prev_device = pre_call(A.device)
-        is_on_gpu([A, out, absmax])
-
-        if A.dtype == torch.float32:
-            if quant_type == 'fp4':
-                lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-            else:
-                lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-        elif A.dtype == torch.float16:
-            if quant_type == 'fp4':
-                lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-            else:
-                lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-        elif A.dtype == torch.bfloat16:
-            if quant_type == 'fp4':
-                lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-            else:
-                lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-        else:
-            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-        post_call(A.device)
-
-        code = get_4bit_type(quant_type, device=A.device)
-
-        if compress_statistics:
-            offset = absmax.mean()
-            absmax -= offset
-            qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
-            del absmax
-            state = QuantState(absmax=qabsmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, offset=offset, state2=state2)
-        else:
-            state = QuantState(absmax=absmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, )
-
-        return out, state
-    @classmethod
-    def dequantize_4bit(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
-        """
-        Dequantizes FP4 blockwise quantized values.
-
-        Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
-
-        Parameters
-        ----------
-        A : torch.Tensor
-            The input 8-bit tensor (packed 4-bit values).
-        quant_state : QuantState
-            object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
-        absmax : torch.Tensor
-            The absmax values.
-        out : torch.Tensor
-            Dequantized output tensor.
-        blocksize : int
-            The blocksize used in quantization.
-        quant_type : str
-            The 4-bit quantization data type {fp4, nf4}
-
-
-        Returns
-        -------
-        torch.Tensor:
-            Dequantized tensor.
-        """
-        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
-            raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
-        if quant_type not in ['fp4', 'nf4']:
-            raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
-
-        if quant_state is None:
-            assert absmax is not None and out is not None
-
-            quant_state = QuantState(absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type)
-
-        else:
-            absmax = quant_state.absmax
-
-
-        if quant_state.nested:
-            absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
-            absmax += quant_state.offset
-            if absmax.dtype != torch.float32: absmax = absmax.float()
-
-        if out is None:
-            out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
-
-        n = out.numel()
-
-        device = pre_call(A.device)
-        is_on_gpu([A, absmax, out])
-        if out.dtype == torch.float32:
-            if quant_state.quant_type == 'fp4':
-                lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-            else:
-                lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-        elif out.dtype == torch.float16:
-            if quant_state.quant_type == 'fp4':
-                lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-            else:
-                lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-        elif out.dtype == torch.bfloat16:
-            if quant_state.quant_type == 'fp4':
-                lib.cdequantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-            else:
-                lib.cdequantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
-        else:
-            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-        post_call(A.device)
-
-        is_transposed = (True if A.shape[0] == 1 else False)
-        if is_transposed: return out.t()
-        else: return out
-
-
-Backends.register_backend("cuda", CUDABackend)
+from bitsandbytes.backends import Backends
 
 # 8 bits common functions
 def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
@@ -2558,4 +2076,16 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
 
 def dequantize_4bit(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4'):
     assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
-    return Backends.device[A.device].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
\ No newline at end of file
+    return Backends.device[A.device].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
+
+def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
+    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4')
+
+def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
+    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4')
+
+def dequantize_fp4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
+    return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
+
+def dequantize_nf4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
+    return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')

From 4050fe387e8330f1f5a10735c5651377450ee9fe Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Mon, 4 Dec 2023 00:49:06 -0800
Subject: [PATCH 026/234] format fix

---
 bitsandbytes/cextension.py | 3 +++
 bitsandbytes/functional.py | 8 ++------
 bitsandbytes/nn/modules.py | 2 +-
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 76bfa6647..72fbf18f0 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -4,8 +4,10 @@
 
 from pathlib import Path
 from warnings import warn
+
 from bitsandbytes.device_setup.cuda.main import CUDASetup
 
+
 setup = CUDASetup.get_instance()
 if setup.initialized != True:
     setup.run_cuda_setup()
@@ -35,6 +37,7 @@
     COMPILED_WITH_CUDA = False
     print(str(ex))
 
+
 # print the setup details after checking for errors so we do not print twice
 #if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
     #setup.print_log_stack()
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 178fa8614..3c74b30cc 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -15,14 +15,10 @@
 from functools import reduce  # Required in Python 3
 from typing import Tuple, Any, Dict
 from torch import Tensor
-
-from warnings import warn
-from .cextension import COMPILED_WITH_CUDA
 from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
 
-# CUDA specific lib
-if COMPILED_WITH_CUDA:
-    from .cextension import lib
+from .cextension import COMPILED_WITH_CUDA, lib
+
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 5de83e867..9c798fe39 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -185,7 +185,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T:
 
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
-    
+
         if (device is not None and self.data.device.type == "cpu"):
             return self.quantize_to_device(device)
         else:

From 30175d1967a7e52fa5b3aac85b46f9eee5143ac2 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Mon, 4 Dec 2023 00:56:18 -0800
Subject: [PATCH 027/234] format fix

---
 bitsandbytes/cextension.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 72fbf18f0..d7088e398 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -13,7 +13,6 @@
     setup.run_cuda_setup()
 
 lib = setup.lib
-
 try:
     if lib is None and torch.cuda.is_available():
         CUDASetup.get_instance().generate_instructions()

From e17549e222b6e6712a4a413e4f2ca2410691ed99 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Mon, 4 Dec 2023 01:09:34 -0800
Subject: [PATCH 028/234] use device.type

---
 bitsandbytes/functional.py | 28 ++++++++++++++--------------
 bitsandbytes/nn/modules.py |  1 +
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 3c74b30cc..036026c77 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -2037,16 +2037,16 @@ def pipeline_test(A, batch_size):
 
 # 8 bits common functions
 def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
-    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
-    return Backends.device[A.device].double_quant(A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
+    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
+    return Backends.devices[A.device.type].double_quant(A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
 
 def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
-    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
-    return Backends.device[A.device].transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
+    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
+    return Backends.devices[A.device.type].transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
 
 def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
-    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
-    return Backends.device[A.device].igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
+    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
+    return Backends.devices[A.device.type].igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
 
 def mm_dequant(
     A,
@@ -2058,21 +2058,21 @@ def mm_dequant(
     new_col_stats=None,
     bias=None
 ):
-    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
-    return Backends.device[A.device].mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
+    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
+    return Backends.devices[A.device.type].mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
 
 def extract_outliers(A, SA, idx):
-    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
-    return Backends.device[A.device].extract_outliers(A, SA, idx)
+    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
+    return Backends.devices[A.device.type].extract_outliers(A, SA, idx)
 
 # 4 bits common functions
 def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4'):
-    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
-    return Backends.device[A.device].quantize_4bit(A, absmax = absmax, out = out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
+    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
+    return Backends.devices[A.device.type].quantize_4bit(A, absmax = absmax, out = out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
 
 def dequantize_4bit(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4'):
-    assert A.device in Backends.device, f"Device backend for {A.device} is not supported"
-    return Backends.device[A.device].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
+    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
+    return Backends.devices[A.device.type].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
 
 def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4')
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 9c798fe39..2bde6b6d2 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -162,6 +162,7 @@ def from_prequantized(cls, data: torch.Tensor, quantized_stats: Dict[str, Any],
         self.blocksize = self.quant_state.blocksize
         self.compress_statistics = self.quant_state.nested
         self.quant_type = self.quant_state.quant_type
+        return self
 
     def quantize_to_device(self, device):
         w = self.data.contiguous().half().to(device)

From a53bc318efe9109f9dcfd6eda579c4051d5ef813 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Mon, 4 Dec 2023 01:16:19 -0800
Subject: [PATCH 029/234] minor fix

---
 bitsandbytes/autograd/_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 757aafb4f..d6e38b1e5 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -230,7 +230,7 @@ def supports_igemmlt(device: torch.device) -> bool:
     nvidia16_models = ('GTX 1630', 'GTX 1650', 'GTX 1660')  # https://en.wikipedia.org/wiki/GeForce_16_series
     if any(model_name in device_name for model_name in nvidia16_models):
         return False  # these devices are technically cuda 7.5-capable, but they lack tensor cores
-    if device == "cpu":
+    if device.type == "cpu":
         #TODO: will return True once CPU backend upstream the supports
         return False
 

From 80c598c3ca95c9ffcd07410d2f87df07f6479ed4 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Mon, 4 Dec 2023 01:41:58 -0800
Subject: [PATCH 030/234] backend refinement

---
 bitsandbytes/backends/__init__.py             | 41 +++++++++++++++++++
 .../{backends.py => backends/cuda.py}         | 39 ------------------
 2 files changed, 41 insertions(+), 39 deletions(-)
 create mode 100644 bitsandbytes/backends/__init__.py
 rename bitsandbytes/{backends.py => backends/cuda.py} (94%)

diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
new file mode 100644
index 000000000..7a4306e92
--- /dev/null
+++ b/bitsandbytes/backends/__init__.py
@@ -0,0 +1,41 @@
+from .cuda import CUDABackend
+
+
+class Backends:
+    """
+    An dict class for device backends that registered with 8bits and 4bits functions.
+
+    The values of this device backends are lowercase strings, e.g., ``"cuda"``. They can
+    be accessed as attributes with key-value, e.g., ``Backends.device["cuda"]``.
+
+    """
+
+    devices = {}
+
+    @classmethod
+    def register_backend(self, backend_name: str, backend_class):
+        assert backend_name.lower() in {
+            "cpu",
+            "cuda",
+            "xpu",
+        }, "register device backend choices in [cpu, cuda, xpu]"
+
+        # check 8bits and 4bits interfaces
+        if (
+            hasattr(backend_class, "double_quant")
+            and hasattr(backend_class, "transform")
+            and hasattr(backend_class, "igemmlt")
+            and hasattr(backend_class, "mm_dequant")
+            and hasattr(backend_class, "extract_outliers")
+            and hasattr(backend_class, "quantize_4bit")
+            and hasattr(backend_class, "dequantize_4bit")
+        ):
+            Backends.devices[backend_name.lower()] = backend_class
+        else:
+            assert (
+                False
+            ), f"register device backend {backend_name.lower()} but its interfaces are not compelete"
+
+
+
+Backends.register_backend("cuda", CUDABackend)
diff --git a/bitsandbytes/backends.py b/bitsandbytes/backends/cuda.py
similarity index 94%
rename from bitsandbytes/backends.py
rename to bitsandbytes/backends/cuda.py
index e2899e840..84b1b70ca 100644
--- a/bitsandbytes/backends.py
+++ b/bitsandbytes/backends/cuda.py
@@ -15,43 +15,6 @@
 )
 from bitsandbytes.functional import CUBLAS_Context, QuantState
 
-
-class Backends:
-    """
-    An dict class for device backends that registered with 8bits and 4bits functions.
-
-    The values of this device backends are lowercase strings, e.g., ``"cuda"``. They can
-    be accessed as attributes with key-value, e.g., ``Backends.device["cuda"]``.
-
-    """
-
-    devices = {}
-
-    @classmethod
-    def register_backend(self, backend_name: str, backend_class):
-        assert backend_name.lower() in {
-            "cpu",
-            "cuda",
-            "xpu",
-        }, "register device backend choices in [cpu, cuda, xpu]"
-
-        # check 8bits and 4bits interfaces
-        if (
-            hasattr(backend_class, "double_quant")
-            and hasattr(backend_class, "transform")
-            and hasattr(backend_class, "igemmlt")
-            and hasattr(backend_class, "mm_dequant")
-            and hasattr(backend_class, "extract_outliers")
-            and hasattr(backend_class, "quantize_4bit")
-            and hasattr(backend_class, "dequantize_4bit")
-        ):
-            Backends.devices[backend_name.lower()] = backend_class
-        else:
-            assert (
-                False
-            ), f"register device backend {backend_name.lower()} but its interfaces are not compelete"
-
-
 class CUDABackend:
     @classmethod
     def double_quant(
@@ -686,5 +649,3 @@ def dequantize_4bit(
         else:
             return out
 
-
-Backends.register_backend("cuda", CUDABackend)

From 59facc84c29984adddef5609b6ba7deeb817c614 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Mon, 4 Dec 2023 20:23:56 -0800
Subject: [PATCH 031/234] minor fix

---
 bitsandbytes/autograd/_functions.py    | 14 ++++++-------
 bitsandbytes/backends/cuda.py          | 28 +++++++++++++++++++++-----
 bitsandbytes/device_setup/cuda/main.py |  2 +-
 bitsandbytes/functional.py             |  2 +-
 4 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index d6e38b1e5..43668dd82 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -291,14 +291,14 @@ def tile_indices(self):
             self._tile_indices = get_tile_inds(self.formatB, self.CxB.device)
         return self._tile_indices
 
+
 class MatMul8bitLt(torch.autograd.Function):
     # forward is the same, but we added the fallback for pre-turing GPUs
     # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None")
 
     @staticmethod
     def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
-        device = A.device
-        using_igemmlt = supports_igemmlt(device) and not state.force_no_igemmlt
+        using_igemmlt = supports_igemmlt(A.device) and not state.force_no_igemmlt
         # default of pytorch behavior if inputs are empty
         ctx.is_empty = False
         if prod(A.shape) == 0:
@@ -307,9 +307,9 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
             ctx.B = B
             ctx.bias = bias
             if A.shape[-1] == B.shape[0]:
-                return torch.empty(A.shape[:-1] + B.shape[1:], dtype=A.dtype, device=device)
+                return torch.empty(A.shape[:-1] + B.shape[1:], dtype=A.dtype, device=A.device)
             else:
-                return torch.empty(A.shape[:-1] + B.shape[:1], dtype=A.dtype, device=device)
+                return torch.empty(A.shape[:-1] + B.shape[:1], dtype=A.dtype, device=A.device)
 
         # 1. Quantize A
         # 2. Quantize B
@@ -341,7 +341,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
             else:
                 if state.CxB is None and using_igemmlt:
                     # B in in 8-bit row-major, we can transform it back to 16-bit to extract outlier dimensions
-                    # we also need to convert it to the turing/ampere format if using cuda
+                    # we also need to convert it to the turing/ampere format
                     state.CxB, state.SB = F.transform(state.CB, to_order=formatB)
         else:
             if not state.has_fp16_weights and state.CxB is None and using_igemmlt:
@@ -403,7 +403,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
         if using_igemmlt:
             C32A, SA = F.transform(CA, "col32")
             out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
-            if bias is None or bias.dtype in [torch.float16, torch.bfloat16]:
+            if bias is None or bias.dtype == torch.float16:
                 # we apply the fused bias here
                 output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias)
                 output = output.to(A.dtype)
@@ -568,7 +568,7 @@ def matmul(
 
 def matmul_4bit(A: tensor, B: tensor, quant_state: F.QuantState, out: tensor = None, bias=None):
     assert quant_state is not None
-    if A.numel() == A.shape[-1] and A.requires_grad == False and A.device == "cuda":
+    if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type == "cuda":
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
             return MatMul4Bit.apply(A, B, out, bias, quant_state)
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 84b1b70ca..acf1aa5de 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -1,5 +1,6 @@
 import torch
 from torch import Tensor
+import ctypes as ct
 from bitsandbytes.functional import (
     pre_call,
     post_call,
@@ -14,11 +15,19 @@
     dequantize_blockwise,
 )
 from bitsandbytes.functional import CUBLAS_Context, QuantState
+from bitsandbytes.cextension import lib
+
 
 class CUDABackend:
     @classmethod
     def double_quant(
-        A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
+        cls,
+        A,
+        col_stats=None,
+        row_stats=None,
+        out_col=None,
+        out_row=None,
+        threshold=0.0,
     ):
         device = A.device
         assert A.dtype == torch.half
@@ -114,7 +123,14 @@ def double_quant(
 
     @classmethod
     def transform(
-        A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None
+        cls,
+        A,
+        to_order,
+        from_order="row",
+        out=None,
+        transpose=False,
+        state=None,
+        ld=None,
     ):
         prev_device = pre_call(A.device)
         if state is None:
@@ -167,7 +183,7 @@ def transform(
         return out, new_state
 
     @classmethod
-    def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+    def igemmlt(cls, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         shapeA = SA[0]
         shapeB = SB[0]
         dimsA = len(shapeA)
@@ -271,6 +287,7 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
 
     @classmethod
     def mm_dequant(
+        cls,
         A,
         quant_state,
         row_stats,
@@ -332,7 +349,7 @@ def mm_dequant(
         return out
 
     @classmethod
-    def extract_outliers(A, SA, idx):
+    def extract_outliers(cls, A, SA, idx):
         shapeA = SA[0]
         formatA = SA[1]
         assert formatA in ["col_turing", "col_ampere"]
@@ -358,6 +375,7 @@ def extract_outliers(A, SA, idx):
 
     @classmethod
     def quantize_4bit(
+        cls,
         A: Tensor,
         absmax: Tensor = None,
         out: Tensor = None,
@@ -509,6 +527,7 @@ def quantize_4bit(
 
     @classmethod
     def dequantize_4bit(
+        cls,
         A: Tensor,
         quant_state: QuantState = None,
         absmax: Tensor = None,
@@ -648,4 +667,3 @@ def dequantize_4bit(
             return out.t()
         else:
             return out
-
diff --git a/bitsandbytes/device_setup/cuda/main.py b/bitsandbytes/device_setup/cuda/main.py
index fd639beb7..cf1cf7796 100644
--- a/bitsandbytes/device_setup/cuda/main.py
+++ b/bitsandbytes/device_setup/cuda/main.py
@@ -125,7 +125,7 @@ def run_cuda_setup(self):
         self.binary_name = binary_name
         self.manual_override()
 
-        package_dir = Path(__file__).parent.parent
+        package_dir = Path(__file__).parent.parent.parent
         binary_path = package_dir / self.binary_name
 
         try:
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 036026c77..4cacfc983 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -2068,7 +2068,7 @@ def extract_outliers(A, SA, idx):
 # 4 bits common functions
 def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4'):
     assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
-    return Backends.devices[A.device.type].quantize_4bit(A, absmax = absmax, out = out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
+    return Backends.devices[A.device.type].quantize_4bit(A, absmax=absmax, out=out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type)
 
 def dequantize_4bit(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4'):
     assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"

From 066d0dc39663b5bebb467e1fd51ac395f8c38bc4 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Mon, 4 Dec 2023 22:56:04 -0800
Subject: [PATCH 032/234] final refinement

---
 bitsandbytes/backends/__init__.py |  5 ++---
 bitsandbytes/nn/modules.py        | 22 +++++++++++++---------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index 7a4306e92..fd046d506 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -13,7 +13,7 @@ class Backends:
     devices = {}
 
     @classmethod
-    def register_backend(self, backend_name: str, backend_class):
+    def register_backend(cls, backend_name: str, backend_class):
         assert backend_name.lower() in {
             "cpu",
             "cuda",
@@ -30,12 +30,11 @@ def register_backend(self, backend_name: str, backend_class):
             and hasattr(backend_class, "quantize_4bit")
             and hasattr(backend_class, "dequantize_4bit")
         ):
-            Backends.devices[backend_name.lower()] = backend_class
+            cls.devices[backend_name.lower()] = backend_class
         else:
             assert (
                 False
             ), f"register device backend {backend_name.lower()} but its interfaces are not compelete"
 
 
-
 Backends.register_backend("cuda", CUDABackend)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 2bde6b6d2..ddc40cfa6 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -164,8 +164,8 @@ def from_prequantized(cls, data: torch.Tensor, quantized_stats: Dict[str, Any],
         self.quant_type = self.quant_state.quant_type
         return self
 
-    def quantize_to_device(self, device):
-        w = self.data.contiguous().half().to(device)
+    def cuda(self, device):
+        w = self.data.contiguous().half().cuda(device)
         w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
         self.data = w_4bit
         self.quant_state = quant_state
@@ -187,8 +187,8 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T:
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if (device is not None and self.data.device.type == "cpu"):
-            return self.quantize_to_device(device)
+        if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"):
+            return self.cuda(device)
         else:
             if self.quant_state is not None:
                 self.quant_state.to(device)
@@ -296,13 +296,13 @@ def __new__(
             data = torch.empty(0)
         return torch.Tensor._make_subclass(cls, data, requires_grad)
 
-    def quantize_to_device(self, device):
+    def cuda(self, device):
         if self.has_fp16_weights:
-            return super().to(device)
+            return super().cuda(device)
         else:
             # we store the 8-bit rows-major weight
             # we convert this weight to the turning/ampere weight during the first inference pass
-            B = self.data.contiguous().half().to(device)
+            B = self.data.contiguous().half().cuda(device)
             CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
             del CBt
             del SCBt
@@ -334,8 +334,12 @@ def to(self, *args, **kwargs):
             *args, **kwargs
         )
 
-        if (device is not None and self.data.device.type == "cpu"):
-            return self.quantize_to_device(device)
+        if (
+            device is not None
+            and device.type == "cuda"
+            and self.data.device.type == "cpu"
+        ):
+            return self.cuda(device)
         else:
             new_param = Int8Params(
                 super().to(

From 657ca4bfbb0642bc430484d9e79fbf7e01449b46 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 21:56:29 +0000
Subject: [PATCH 033/234] Enable col to row transformation

---
 csrc/ops.hip | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index 06ff5a0ae..08f6fe122 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -431,6 +431,8 @@ template void transform<int8_t, ROW, COL_TURING, false, 8>(hipblasLtHandle_t ltH
 template void transform<int8_t, ROW, COL_AMPERE, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
 template void transform<int8_t, COL32, ROW, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
 template void transform<int32_t, COL32, ROW, false, 32>(hipblasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
+template void transform<int8_t, COL, ROW, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int32_t, COL, ROW, false, 32>(hipblasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
 #endif
 static std::string hipError_to_string(const hipError_t ret)
 {

From a390e0c44ef2585ba1d35131579f6c8f3fa61b03 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 22:08:06 +0000
Subject: [PATCH 034/234] Add make functions for row to col transformation

---
 csrc/pythonInterface.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index f84e0e8e5..1bb0de395 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -145,6 +145,8 @@ MAKE_FUNC_TRANSFORM(8, row, col_turing, n, int8_t, ROW, COL_TURING, false, 8);
 MAKE_FUNC_TRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8);
 MAKE_FUNC_TRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8);
 MAKE_FUNC_TRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32);
+MAKE_FUNC_TRANSFORM(8, col, row, n, int8_t, COL, ROW, false, 8);
+MAKE_FUNC_TRANSFORM(32, col, row, n, int32_t, COL, ROW, false, 32);
 #endif
 
 void transform_row2col32(char * A, char *out, int rows, int cols){ transformRowToFormat<COL32, 0>(A, out, rows, cols); }
@@ -381,6 +383,8 @@ extern "C"
 	MAKE_FUNC_CTRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8)
 	MAKE_FUNC_CTRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32)
+	MAKE_FUNC_CTRANSFORM(8, col, row, n, int8_t, COL, ROW, false, 8)
+	MAKE_FUNC_CTRANSFORM(32, col, row, n, int32_t, COL, ROW, false, 32)
 #endif
 	void cdequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols)
 	{ dequant_mm_int32_fp16(A, rowStats, colStats, out, newRowStats, newcolStats, bias, numRows, numCols); }

From 99ad6b5758ad6b76bc97c74706bdef259e874201 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 22:11:51 +0000
Subject: [PATCH 035/234] Update get_transform_buffer for row to col in HIP

---
 bitsandbytes/functional.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 91c1ab5e4..988ce4376 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -16,7 +16,7 @@
 from typing import Tuple
 from torch import Tensor
 
-from .cextension import COMPILED_WITH_CUDA, lib
+from .cextension import COMPILED_WITH_CUDA, lib, HIP_ENVIRONMENT
 
 # Remark: for AMD GPU we need to disable blocksize == 64
 
@@ -458,7 +458,10 @@ def get_transform_buffer(
         state = (shape[::-1], to_order)
 
     if to_order == "row" or to_order == "col":
-        return init_func(shape, dtype=dtype, device=device), state
+        if HIP_ENVIRONMENT and to_order == "col":
+            return init_func(shape[::-1], dtype=dtype, device=device), state
+        else:
+            return init_func(shape, dtype=dtype, device=device), state
     elif to_order == "col32":
         # blocks of 32 columns (padded)
         cols = 32 * ((cols + 31) // 32)

From 039b80862c10883962c5e390ee1d3f0aec575f48 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 22:13:08 +0000
Subject: [PATCH 036/234] Update igemmlt for col format

---
 bitsandbytes/functional.py | 59 +++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 988ce4376..8e17adf99 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1718,13 +1718,23 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
 
     if dimsA == 2 and out is None:
-        out, Sout = get_transform_buffer(
-            (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
-        )
+        if HIP_ENVIRONMENT:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeB[0]), dtype, A.device, "col", "row"
+            )
+        else:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
+            )
     elif dimsA == 3 and out is None:
-        out, Sout = get_transform_buffer(
-            (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
-        )
+        if HIP_ENVIRONMENT:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col", "row"
+            )
+        else:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
+            )
 
     assert dimsB != 3, "len(B.shape)==3 not supported"
     assert A.device.type == "cuda"
@@ -1732,9 +1742,14 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     assert A.dtype == torch.int8
     assert B.dtype == torch.int8
     assert out.dtype == dtype
-    assert SA[1] == "col32"
-    assert SB[1] in ["col_turing", "col_ampere"]
-    assert Sout[1] == "col32"
+    if HIP_ENVIRONMENT:
+        assert SA[1] == "col"
+        assert SB[1] == "col"
+        assert Sout[1] == "col"
+    else:
+        assert SA[1] == "col32"
+        assert SB[1] in ["col_turing", "col_ampere"]
+        assert Sout[1] == "col32"
     assert (
         shapeA[-1] == shapeB[-1]
     ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
@@ -1748,17 +1763,21 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     ptrC = get_ptr(out)
 
     k = shapeA[-1]
-    lda = ct.c_int32(m * 32)
-    if formatB == "col_turing":
-        # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
-        # n = rows
-        ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+    if HIP_ENVIRONMENT:
+        lda = ct.c_int32(m)
+        ldb = ct.c_int32(shapeB[0])
+        ldc = ct.c_int32(m)
     else:
-        # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
-        # n = rows
-        ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
-
-    ldc = ct.c_int32(m * 32)
+        lda = ct.c_int32(m * 32)
+        if formatB == "col_turing":
+            # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
+            # n = rows
+            ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+        else:
+            # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
+            # n = rows
+            ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
+        ldc = ct.c_int32(m * 32)
     m = ct.c_int32(m)
     n = ct.c_int32(n)
     k = ct.c_int32(k)
@@ -1766,7 +1785,7 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     has_error = 0
     ptrRowScale = get_ptr(None)
     is_on_gpu([A, B, out])
-    if formatB == 'col_turing':
+    if formatB == 'col_turing' or HIP_ENVIRONMENT:
         if dtype == torch.int32:
             has_error = lib.cigemmlt_turing_32(
                 ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc

From 1a052ee3bc8b1874fe706cca6316b0949c9ddf44 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 22:14:02 +0000
Subject: [PATCH 037/234] Unskip test_igemmlt_int on ROCm

---
 tests/test_functional.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 44a4e662a..5166a4f41 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -686,7 +686,6 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
     for vals in values
 ]
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims, ldb", values, ids=names)
 def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
     for i in range(k):

From b7ca5cf7c409477896e4fcc6c8493751d4adbca4 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 22:17:41 +0000
Subject: [PATCH 038/234] Update igemmlt_int test for col inputs

---
 tests/test_functional.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 5166a4f41..abbe7dd12 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -702,8 +702,14 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
         )
         C1 = torch.matmul(A.float(), B.t().float())
 
-        A2, SA = F.transform(A, "col32")
-        B2, SB = F.transform(B, "col_turing")
+        # col32, col_turing and col_ampere are HW specific and are not applicable to ROCm
+        # using col format instead
+        if HIP_ENVIRONMENT:
+            A2, SA = F.nvidia_transform(A, "col", state=(A.shape,"row"))
+            B2, SB = F.nvidia_transform(B, "col", state=(B.shape,"row"))
+        else:
+            A2, SA = F.transform(A, "col32")
+            B2, SB = F.transform(B, "col_turing")
         C2, SC = F.igemmlt(A2, B2, SA, SB)
         C3, S = F.nvidia_transform(C2, "row", state=SC)
         torch.testing.assert_close(C1, C3.float())

From a2cd90d12de88166d73d5a29fb3994b123c887a3 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 22:19:14 +0000
Subject: [PATCH 039/234] Skip transpose igemmlt test on ROCm

---
 tests/test_functional.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index abbe7dd12..bac6d2058 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -714,16 +714,19 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
         C3, S = F.nvidia_transform(C2, "row", state=SC)
         torch.testing.assert_close(C1, C3.float())
 
-        # transpose
-        B = torch.randint(-128, 127, size=(dim3, dim4), device="cuda").to(
-            torch.int8
-        )
-        C1 = torch.matmul(A.float(), B.float())
+        # Since ROCm supports row to col transformation only which is same as transpose,
+        # skipping this for HIP environment
+        if not HIP_ENVIRONMENT:
+            ## transpose
+            B = torch.randint(-128, 127, size=(dim3, dim4), device="cuda").to(
+                torch.int8
+            )
+            C1 = torch.matmul(A.float(), B.float())
 
-        B2t, SBt = F.transform(B, "col_turing", transpose=True)
-        C2, SC = F.igemmlt(A2, B2t, SA, SBt)
-        C3, S = F.nvidia_transform(C2, "row", state=SC)
-        torch.testing.assert_close(C1, C3.float())
+            B2t, SBt = F.transform(B, "col_turing", transpose=True)
+            C2, SC = F.igemmlt(A2, B2t, SA, SBt)
+            C3, S = F.nvidia_transform(C2, "row", state=SC)
+            torch.testing.assert_close(C1, C3.float())
 
 
 dim1 = [32]

From 5b6c5ac3c71ab826b55138cd8bf9db56c5dcacc6 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 22:34:52 +0000
Subject: [PATCH 040/234] Revert "Update igemmlt_int test for col inputs"

This reverts commit b7ca5cf7c409477896e4fcc6c8493751d4adbca4.
---
 tests/test_functional.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index bac6d2058..048f90e2d 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -702,14 +702,8 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
         )
         C1 = torch.matmul(A.float(), B.t().float())
 
-        # col32, col_turing and col_ampere are HW specific and are not applicable to ROCm
-        # using col format instead
-        if HIP_ENVIRONMENT:
-            A2, SA = F.nvidia_transform(A, "col", state=(A.shape,"row"))
-            B2, SB = F.nvidia_transform(B, "col", state=(B.shape,"row"))
-        else:
-            A2, SA = F.transform(A, "col32")
-            B2, SB = F.transform(B, "col_turing")
+        A2, SA = F.transform(A, "col32")
+        B2, SB = F.transform(B, "col_turing")
         C2, SC = F.igemmlt(A2, B2, SA, SB)
         C3, S = F.nvidia_transform(C2, "row", state=SC)
         torch.testing.assert_close(C1, C3.float())

From 218bf66212ebfc8c4d3588ea79dfab63defcb2ab Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 22:38:52 +0000
Subject: [PATCH 041/234] Return nvidia_transform from transform for HIP

---
 bitsandbytes/functional.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 8e17adf99..30ff48243 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -489,6 +489,10 @@ def nvidia_transform(
     state=None,
     ld=None,
 ):
+    if HIP_ENVIRONMENT:
+        to_order = "col" if to_order in ["col32","col_turing","col_ampere"]
+        from_order = "col" if from_order in ["col32","col_turing","col_ampere"]
+
     if state is None:
         state = (A.shape, from_order)
     else:
@@ -2094,6 +2098,9 @@ def double_quant(
 
 
 def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+    if HIP_ENVIRONMENT:
+        return nvidia_transform(A,to_order,from_order,out,transpose,state,ld)
+
     prev_device = pre_call(A.device)
     if state is None: state = (A.shape, from_order)
     else: from_order = state[1]

From 8bb5c2f783351b1e92c8e10c38537d5d2632ce1d Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 12 Jan 2024 23:07:55 +0000
Subject: [PATCH 042/234] Fix syntax error

---
 bitsandbytes/functional.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 30ff48243..a0e67048e 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -490,8 +490,8 @@ def nvidia_transform(
     ld=None,
 ):
     if HIP_ENVIRONMENT:
-        to_order = "col" if to_order in ["col32","col_turing","col_ampere"]
-        from_order = "col" if from_order in ["col32","col_turing","col_ampere"]
+        to_order = "col" if to_order in ["col32","col_turing","col_ampere"] else to_order
+        from_order = "col" if from_order in ["col32","col_turing","col_ampere"] else from_order
 
     if state is None:
         state = (A.shape, from_order)

From eb2edf7e4321cc5cdfc1da79af9d03541e39c92f Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Tue, 16 Jan 2024 11:51:28 -0600
Subject: [PATCH 043/234] Add comment for shape change

---
 bitsandbytes/functional.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index a0e67048e..0ae35acd2 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -459,6 +459,7 @@ def get_transform_buffer(
 
     if to_order == "row" or to_order == "col":
         if HIP_ENVIRONMENT and to_order == "col":
+            # row to col transformation transposes output shape, so change buffer allocation accordingly
             return init_func(shape[::-1], dtype=dtype, device=device), state
         else:
             return init_func(shape, dtype=dtype, device=device), state

From a38ea0fd67ca1d20dea8dc3062f2493e019d2a46 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Tue, 16 Jan 2024 21:11:50 +0000
Subject: [PATCH 044/234] Enable nvidia_transform tests

---
 tests/test_functional.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 048f90e2d..c6803dc47 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -594,7 +594,7 @@ def test_vector_quant(dim1, dim2, dim3):
 # dim1, dim2 = (256,), (256,)
 dtype = [torch.int8, torch.int32]
 a_order = ["row"]
-out_order = ["col", "row", "col32"]
+out_order = ["col", "row"] if HIP_ENVIRONMENT else ["col", "row", "col32"]
 transpose = [False]
 dims = [2, 3]
 values = list(product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose))
@@ -602,7 +602,6 @@ def test_vector_quant(dim1, dim2, dim3):
 names = ["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values]
 
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names)
 def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
     if dims == 3 and out_order != "col32":
@@ -622,7 +621,7 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
 
     if orderOut == "row":
         torch.testing.assert_close(A.flatten(), out.flatten())
-    elif orderOut == "col":
+    elif orderOut == "col" or (HIP_ENVIRONMENT and orderOut == "col32"):
         torch.testing.assert_close(A.t().flatten(), out.flatten())
     elif orderOut == "col32":
         if dims == 2:

From 67c383bc5bb70c9266e03b517dcf8d8060eb5a75 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Tue, 16 Jan 2024 21:13:45 +0000
Subject: [PATCH 045/234] Enable igemmlt_half tests

---
 tests/test_functional.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index c6803dc47..934998a91 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -735,7 +735,6 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
     for vals in values
 ]
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims", values, ids=names)
 def test_igemmlt_half(dim1, dim2, dim3, dim4, dims):
     formatB = F.get_special_format_str()

From 42b860f373b4e84c07aae33f5778b995abedc8fc Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Tue, 16 Jan 2024 21:18:21 +0000
Subject: [PATCH 046/234] Revert col32 check in nvidia_transform test

---
 tests/test_functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 934998a91..aec7cdf2d 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -621,7 +621,7 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
 
     if orderOut == "row":
         torch.testing.assert_close(A.flatten(), out.flatten())
-    elif orderOut == "col" or (HIP_ENVIRONMENT and orderOut == "col32"):
+    elif orderOut == "col":
         torch.testing.assert_close(A.t().flatten(), out.flatten())
     elif orderOut == "col32":
         if dims == 2:

From c36085d661fb03ed16c3fa3b4cde7b97d08cb490 Mon Sep 17 00:00:00 2001
From: Zhaoyi Li <36555117+Lzy17@users.noreply.github.com>
Date: Fri, 26 Jan 2024 16:50:06 -0600
Subject: [PATCH 047/234] Update README.md

---
 README.md | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index ff5246750..2058b8482 100644
--- a/README.md
+++ b/README.md
@@ -18,31 +18,35 @@ Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + ROCm >= 5.4.2 or CUDA >
 **Installation**:
 
 
-You need to compile from source. 
+You need to compile from source for ROCm. 
 
 Compilation quickstart:
 ```bash
-git clone https://github.com/Lzy17/bitsandbytes-rocm
-cd bitsandbytes-rocm
+# Run Docker
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --name=bnb_test --shm-size=8g --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --group-add video rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
 
-make hip
-python setup.py install
 
-#to test if you have successfully installed
-python -m bitsandbytes
+# Install Dependencies
+cd <workspace>
+git clone --recurse https://github.com/ROCmSoftwarePlatform/hipBLASLt
+cd hipBLASLt
+git checkout 4b3b34405e7e25cff404f69bfd0a832644430477
+./install.sh -idc
+ 
+cd ..
+pip install einops lion_pytorch
+
 
-#To be benchmarks accuray benchmark from https://github.com/TimDettmers/bitsandbytes/issues/565
-cd benchmarking/accuracy
-python bnb_accuracy.py
+# Install BitsandBytes
+git clone --recurse https://github.com/ROCmSoftwarePlatform/bitsandbytes
+cd bitsandbytes
+git checkout rocm_enabled
+make hip
+python setup.py install
 
-#Accurate results should looks like
-#tensor(526.7872, device='cuda:0')
-#tensor(551.2297, device='cuda:0')
-#tensor(574.9075, device='cuda:0')
-#tensor(3435.1819, device='cuda:0')
-#tensor(3480.1541, device='cuda:0')
 
-#
+# Run the unit test. If it runs successfully, the library has been installed successfully.
+pytest -vvv ./tests/ 2>&1 | tee BitsAndBytes_UT_summary.log
 ```
 
 **Using Int8 inference with HuggingFace Transformers**

From 0e91e481643a82a5d7c269feb7108fa6a56ff760 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Fri, 26 Jan 2024 23:56:22 +0000
Subject: [PATCH 048/234] Update hip files with upstream changes

---
 bitsandbytes/cuda_setup/main.py |   2 +-
 csrc/kernels.hip                | 278 ++++++++++++++++++++++++++++----
 csrc/kernels.hiph               |   2 +
 csrc/ops.hip                    |  37 ++++-
 csrc/ops.hiph                   |   1 +
 csrc/pythonInterface.c          |  35 ++--
 6 files changed, 299 insertions(+), 56 deletions(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index fe24d7ffb..b4962c1a0 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -345,7 +345,7 @@ def evaluate_cuda_setup():
               ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
         cuda_setup.add_log_entry('='*80)
     if not torch.cuda.is_available(): return 'libbitsandbytes_cpu.so', None, None, None
-    if torch.version.hip: return 'libbitsandbytes_hip_nohipblaslt.so', None, None, None, None
+    if torch.version.hip: return 'libbitsandbytes_hip_nohipblaslt.so', None, None, None
 
     cudart_path = determine_cuda_runtime_lib_path()
     ccs = get_compute_capabilities()
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 019024014..64a93cc6e 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -198,6 +198,7 @@ __device__ half dhDequantizeNF4(unsigned char val)
 
 __device__ float dDequantizeNF4(unsigned char val)
 {
+
   // the values for this tree was generated by test_normal_map_tree
   // in the file tests/test_functional.py
   if((val & 0b1000) == 8)
@@ -2153,7 +2154,12 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
   {
     //smem_col_absmax_values[threadIdx.x + (j*THREADS)] = -FLT_MAX;
     smem_row_absmax_values[threadIdx.x + (j*THREADS)] = -FLT_MAX;
-    smem_row_nnz_values[threadIdx.x + (j*THREADS)] = 0;
+    // smem_row_nnz_values[threadIdx.x + (j*THREADS)] = 0;
+  }
+
+  #pragma unroll TILE_ROWS
+  for (int j = 0; j < TILE_ROWS; j++) {
+    smem_row_nnz_values[j] = 0;
   }
 
   #pragma unroll ITEMS_PER_THREAD
@@ -3064,7 +3070,7 @@ template <typename T, typename TCAST, int ITEMS> __device__ inline void vector_l
     }
 }
 
-#define WARPS 5
+#define WARPS 3
 template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A,  T* B,  T * out,  int lda, int ldb, int ldc)
 {
 
@@ -3273,6 +3279,18 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,
 #endif
 }
 
+
+template <typename T> __device__ void printnonzero(T *A, int num_values, const char * strval)
+{
+  for(int i = 0; i < num_values; i++)
+    if((float)A[i] != 0.0)
+      printf("%s %i %f\n", strval, i, (float)A[i]);
+}
+
+template __device__ void printnonzero<float>(float *A, int num_values, const char*strval);
+template __device__ void printnonzero<half>(half *A, int num_values, const char*strval);
+
+__device__ static float nf4_data[16] = {-1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453, -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0, 0.07958029955625534, 0.16093020141124725, 0.24611230194568634, 0.33791524171829224, 0.44070982933044434, 0.5626170039176941, 0.7229568362236023, 1.0};
 template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize)
 {
 
@@ -3280,26 +3298,40 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
 	using namespace nvcuda;
   int col_offset = blockIdx.x *32;
   const int warp_id = threadIdx.x / 32;
+  const int warp_idx = threadIdx.x % 32;
   const int half_warp_id = threadIdx.x / 16;
   const int half_warp_lane = threadIdx.x % 16;
   const int batch_size_warps = (WARPS-1)*2;
 
+  T quant_map[16];
+
+  #pragma unroll 16
+  for(int i = 0; i < 16; i++)
+    quant_map[i] = nf4_data[i];
+  //__shared__ T quant_map[16*160];
+
   T local_A[2];
   T local_B[64];
   unsigned char local_B_4bit[32];
 
+
   const int a_tile_offset = 16;
   const int b_tile_offset = (16*32 + 16);
 
-  __shared__ T smem_A[8*16 + (2*16*(batch_size_warps-1))];
+  __shared__ T smem_A[8*16 + (16*(batch_size_warps-1))];
   __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))];
-  //__shared__ T smem_C[8*32];
+  __shared__ T smem_C[8*32];
 
    wmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, 8, 32, 16, half> c_frag;
    wmma::fill_fragment(c_frag, 0.0f);
 
+  for(int i = threadIdx.x; i < (8*32); i+=blockDim.x)
+    smem_C[i] = 0.0f;
+
+  __syncthreads();
+
   int ticktock = 0;
   int idx = 0 + threadIdx.x;
   int loaded_values = 0;
@@ -3325,8 +3357,17 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
       #pragma unroll 64
       for(int col = 0; col < 64; col+=2)
       {
-        local_B[col] = dhDequantizeNF4(local_B_4bit[col/2] >> 4)*T(1.0f);
-        local_B[col+1] = dhDequantizeNF4(local_B_4bit[col/2] & 0x0F)*T(1.0f);
+        //local_B[col] = dhDequantizeNF4(local_B_4bit[col/2] >> 4)*T(1.0f);
+        //local_B[col+1] = dhDequantizeNF4(local_B_4bit[col/2] & 0x0F)*T(1.0f);
+        //local_B[col] = d2DequantizeFP4(local_B_4bit[col/2] >> 4)*(float)(17.0);
+        //local_B[col+1] = d2DequantizeFP4(local_B_4bit[col/2] & 0x0F)*(float)(17.0);
+        //local_B[col] = 127*(local_B_4bit[col/2] >> 4)*(float)(17.0);
+        //local_B[col+1] = 127*(local_B_4bit[col/2] & 0x0F)*(float)(17.0);
+
+        //local_B[col] = quant_map[(local_B_4bit[col/2] >> 4)]*T(17.0);
+        //local_B[col+1] = quant_map[(local_B_4bit[col/2] & 0x0F)]*T(17.0);
+        local_B[col] = quant_map[160*(local_B_4bit[col/2] >> 4)+warp_idx]*T(17.0);
+        local_B[col+1] = quant_map[160*(local_B_4bit[col/2] & 0x0F)+warp_idx]*T(17.0);
       }
     }
 
@@ -3350,13 +3391,17 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
       smem_B[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*b_tile_offset) + (col*16)] = 0.0f;
   }
   ticktock = ticktock == 0 ? 1 : 0;
+    //if(threadIdx.x == 0)
+      //printf("aa %i %i\n", idx, loaded_values);
 
   //for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32)
   for(int base_idx = blockDim.x-32; base_idx < K; base_idx+=blockDim.x-32)
   {
     idx = base_idx + threadIdx.x;
+    //if(threadIdx.x == 0)
+      //printf("%i %i\n", idx, loaded_values);
 
-    __syncthreads();
+    //__syncthreads();
     if(idx < K && warp_id < (WARPS-1))
     {
       if(loaded_values == 0)
@@ -3384,9 +3429,17 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
         #pragma unroll 64
         for(int col = 0; col < 64; col+=2)
         {
-          local_B[col] = dhDequantizeNF4(local_B_4bit[col/2] >> 4)*T(absidx);
-          local_B[col+1] = dhDequantizeNF4(local_B_4bit[col/2] & 0x0F)*T(absidx);
-        }
+          //local_B[col] = dhDequantizeNF4(local_B_4bit[col/2] >> 4)*T(absidx);
+          //local_B[col+1] = dhDequantizeNF4(local_B_4bit[col/2] & 0x0F)*T(absidx);
+          //local_B[col] = T(127)*T(local_B_4bit[col/2] >> 4)*T(absidx);
+          //local_B[col+1] = T(127)*T(local_B_4bit[col/2] & 0x0F)*T(absidx);
+
+          //local_B[col] = quant_map[160*(local_B_4bit[col/2] >> 4)+warp_idx]*T(local_absmax);
+          //local_B[col+1] = quant_map[160*(local_B_4bit[col/2] & 0x0F)+warp_idx]*T(local_absmax);
+          local_B[col] = quant_map[(local_B_4bit[col/2] >> 4)]*T(absidx);
+          local_B[col+1] = quant_map[(local_B_4bit[col/2] & 0x0F)]*T(absidx);
+	}
+        //printnonzero<T>(local_B, 128, "");
       }
 
       smem_A[half_warp_lane + (((batch_size_warps*ticktock)+half_warp_id)*a_tile_offset)] = local_A[0];
@@ -3420,6 +3473,11 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
   }
 
   __syncthreads();
+  //if(threadIdx.x == 0)
+  //{
+  //  printnonzero<T>(smem_A, 8*16 + (2*16*(batch_size_warps-1)), "A: ");
+  //  printnonzero<T>(smem_B, 2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1)), "B: ");
+  //}
   if(warp_id != (WARPS-1)){ return; }
   // only warp_id == (WARPS-1) from here
   int warp_lane = threadIdx.x % 32;
@@ -3427,6 +3485,8 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
   ticktock = ticktock == 0 ? 1 : 0;
   for(int k = 0; k < batch_size_warps; k++)
   {
+    //if(warp_lane == 0)
+      //printf("%i %i %i %i\n", (ticktock*batch_size_warps + k)*a_tile_offset, k, ticktock, threadIdx.x);
     wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
     wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
     wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
@@ -3434,13 +3494,135 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
 
   // 129 mu
   if(warp_id == (WARPS-1))
-    wmma::store_matrix_sync(&(smem_A[0]), c_frag, 32, wmma::mem_row_major);
+    wmma::store_matrix_sync(&(smem_C[0]), c_frag, 32, wmma::mem_row_major);
+
+  //printnonzero<T>(smem_C, 32, "");
 
   if(col_offset + warp_lane < M)
-    out[col_offset + warp_lane] = smem_A[warp_lane];
+    out[col_offset + warp_lane] = smem_C[warp_lane];
 #endif
 }
 
+#define num_values_4bit 32
+template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, const float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize)
+{
+
+  // per threadblock: 
+  // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps]
+  // 4 warps -> 4 loads per iter
+  // 1x32 * 32x4 -> 1x4 outputs per thread block
+  typedef hipcub::WarpReduce<float> WarpReduce;
+  __shared__ typename WarpReduce::TempStorage temp_storage[THREADS/32];
+
+  const int warp_idx = threadIdx.x / 32;
+  const int warp_lane = threadIdx.x % 32;
+  const int row_B = (THREADS/32)*blockIdx.x + warp_idx;
+  const int num_values_8bit = num_values_4bit/2;
+  float local_C = 0.0f;
+
+  unsigned char local_B_4bit[num_values_8bit];
+  T local_B[num_values_4bit/4];
+  T local_A[num_values_4bit/4];
+  __shared__ T quant_map[16];
+	T local_absmax = T(0.0f);
+
+  for(int i = threadIdx.x; i < 16; i++)
+    quant_map[i] = T(datatype[i]);
+  __syncthreads();
+
+  // A: [1, K]
+  // B: [N, K]
+  for(int inner_idx = warp_lane*num_values_4bit; inner_idx < K; inner_idx += 32*num_values_4bit)
+  {
+    int inner_idx_halved = inner_idx/2;
+    int offset_B = ldb*row_B;
+    int absidx = ((2*offset_B)+inner_idx)/blocksize;
+	  local_absmax = __ldg(&(absmax[absidx]));
+
+    if(row_B < M)
+    {
+      if((inner_idx_halved + num_values_8bit) < (K/2))
+      {
+        // this is the most important for performance considerations
+        reinterpret_cast<int4(&)[num_values_8bit]>(local_B_4bit)[0] = reinterpret_cast<int4*>(B)[(offset_B+(inner_idx_halved))/(num_values_8bit)];
+      }
+      else
+      {
+        #pragma unroll
+        for(int j = 0; j < (num_values_8bit); j++)
+          if((inner_idx_halved) + j < (K/2))
+            local_B_4bit[j] = B[offset_B+inner_idx_halved + j];
+          else
+            local_B_4bit[j] = 0b01110111;
+      }
+    }
+    else
+    {
+      #pragma unroll
+      for(int j = 0; j < (num_values_8bit); j++)
+          local_B_4bit[j] = 0b01110111;
+    }
+
+    for(int i = 0; i < 4; i++)
+    {
+      #pragma unroll
+      for(int k = 0; k < num_values_8bit/4; k++)
+      {
+        #if __CUDA_ARCH__ >= 800
+          local_B[k*2] = quant_map[local_B_4bit[(i*num_values_8bit/4) + k] >> 4]*local_absmax;
+          local_B[k*2 + 1] = quant_map[local_B_4bit[(i*num_values_8bit/4) + k] & 0x0F]*local_absmax;
+        #else
+          // bf16 multipliation not supported
+          local_B[k*2] = T((float)quant_map[local_B_4bit[(i*num_values_8bit/4) + k] >> 4]*(float)local_absmax);
+          local_B[k*2 + 1] = T((float)quant_map[local_B_4bit[(i*num_values_8bit/4) + k] & 0x0F]*(float)local_absmax);
+        #endif
+      }
+
+      if(inner_idx+(num_values_4bit/4) + (i*num_values_4bit/4) < K)
+      {
+        // this is also relatively important for performance
+        if(BITS==16)
+        {
+          reinterpret_cast<int4(&)[num_values_4bit]>(local_A)[0] = reinterpret_cast<int4*>(A)[inner_idx/(num_values_4bit/4) + i];
+        }
+        else
+        {
+          reinterpret_cast<int4(&)[num_values_4bit]>(local_A)[0] = reinterpret_cast<int4*>(A)[inner_idx/(num_values_4bit/8) + (2*i) + 0];
+          reinterpret_cast<int4(&)[num_values_4bit]>(local_A)[1] = reinterpret_cast<int4*>(A)[inner_idx/(num_values_4bit/8) + (2*i) + 1];
+        }
+
+      }
+      else
+        #pragma unroll
+        for(int k = 0; k < num_values_4bit/4; k++)
+          if(inner_idx + (i*num_values_4bit/4) + k < K)
+            local_A[k] = A[inner_idx + k + (i*num_values_4bit/4)];
+          else
+            local_A[k] = T(0.0f);
+
+
+      // accumulate in float; small performance hit for Ampere, but lower error for outputs
+      #pragma unroll
+      for(int k = 0; k < num_values_4bit/4; k++)
+      {
+        #if __CUDA_ARCH__ >= 800
+          local_C += (float)(local_A[k]*local_B[k]);
+        #else
+          // bf16 multipliation not supported
+          local_C += ((float)local_A[k]*(float)local_B[k]);
+        #endif
+      }
+    }
+  }
+
+  local_C = WarpReduce(temp_storage[warp_idx]).Sum(local_C);
+
+  if(row_B < M && warp_lane == 0)
+    out[row_B] = T(local_C);
+
+}
+
+
 //#define ROWS 2
 //template <typename T, int ITEMS, int THREADS> __global__ void gemm_device(int M, int N, int K, T const* A,  T* B,  T * out,  int lda, int ldb, int ldc)
 //{
@@ -3603,8 +3785,14 @@ template __global__ void gemm_device<half, 16, 32>(int M, int N, int K, half * _
 template __global__ void gemm_device<half, 16, 64>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
 template __global__ void gemm_device<half, 16, 96>(int M, int N, int K, half * __restrict__ const A,  half* B,  half * out,  int lda, int ldb, int ldc);
 
+template __global__ void kgemm_4bit_inference<half, 96>(int M, int N, int K, half * __restrict__ const A, unsigned char *B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize);
 template __global__ void kgemm_4bit_inference<half, 128>(int M, int N, int K, half * __restrict__ const A, unsigned char *B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize);
 template __global__ void kgemm_4bit_inference<half, 160>(int M, int N, int K, half * __restrict__ const A, unsigned char *B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize);
+template __global__ void kgemm_4bit_inference<half, 256>(int M, int N, int K, half * __restrict__ const A, unsigned char *B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize);
+
+template __global__ void kgemm_4bit_inference_naive<half, 128, 16>(int M, int N, int K, half * __restrict__ const A, unsigned char *B,  float *absmax, const float *datatype, half * out,  int lda, int ldb, int ldc, int blocksize);
+template __global__ void kgemm_4bit_inference_naive<hip_bfloat16, 128, 16>(int M, int N, int K, hip_bfloat16 * __restrict__ const A, unsigned char *B,  float *absmax, const float *datatype, hip_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize);
+template __global__ void kgemm_4bit_inference_naive<float, 128, 32>(int M, int N, int K, float * __restrict__ const A, unsigned char *B,  float *absmax, const float *datatype, float * out,  int lda, int ldb, int ldc, int blocksize);
 
 template __global__ void kExtractOutliers<COL_TURING>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
 template __global__ void kExtractOutliers<COL_AMPERE>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
@@ -3763,15 +3951,6 @@ MAKE_kQuantizeBlockwise(half,   256, 2, 0, General8bit)
 MAKE_kQuantizeBlockwise(half,   128, 2, 0, General8bit)
 //MAKE_kQuantizeBlockwise(half,    64, 2, 0, General8bit)
 
-MAKE_kQuantizeBlockwise(float, 4096, 4, 0, General8bit)
-MAKE_kQuantizeBlockwise(float, 4096, 4, 1, General8bit)
-MAKE_kQuantizeBlockwise(float, 2048, 4, 0, General8bit)
-MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit)
-MAKE_kQuantizeBlockwise(float,  512, 2, 0, General8bit)
-MAKE_kQuantizeBlockwise(float,  256, 2, 0, General8bit)
-MAKE_kQuantizeBlockwise(float,  128, 2, 0, General8bit)
-//MAKE_kQuantizeBlockwise(float,   64, 2, 0, General8bit)
-
 MAKE_kQuantizeBlockwise(half,  4096, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(half,  2048, 4, 0, FP4)
 MAKE_kQuantizeBlockwise(half,  1024, 4, 0, FP4)
@@ -3780,16 +3959,6 @@ MAKE_kQuantizeBlockwise(half,   256, 2, 0, FP4)
 MAKE_kQuantizeBlockwise(half,   128, 2, 0, FP4)
 //MAKE_kQuantizeBlockwise(half,    64, 2, 0, FP4)
 
-
-MAKE_kQuantizeBlockwise(float, 4096, 4, 0, FP4)
-MAKE_kQuantizeBlockwise(float, 2048, 4, 0, FP4)
-MAKE_kQuantizeBlockwise(float, 1024, 4, 0, FP4)
-MAKE_kQuantizeBlockwise(float,  512, 2, 0, FP4)
-MAKE_kQuantizeBlockwise(float,  256, 2, 0, FP4)
-MAKE_kQuantizeBlockwise(float,  128, 2, 0, FP4)
-//MAKE_kQuantizeBlockwise(float,   64, 2, 0, FP4)
-
-
 MAKE_kQuantizeBlockwise(half,  4096, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(half,  2048, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(half,  1024, 4, 0, NF4)
@@ -3798,6 +3967,22 @@ MAKE_kQuantizeBlockwise(half,   256, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(half,   128, 2, 0, NF4)
 //MAKE_kQuantizeBlockwise(half,    64, 2, 0, NF4)
 
+MAKE_kQuantizeBlockwise(float, 4096, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(float, 4096, 4, 1, General8bit)
+MAKE_kQuantizeBlockwise(float, 2048, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(float, 1024, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(float,  512, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(float,  256, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(float,  128, 2, 0, General8bit)
+//MAKE_kQuantizeBlockwise(float,   64, 2, 0, General8bit)
+
+MAKE_kQuantizeBlockwise(float, 4096, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(float, 2048, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(float, 1024, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(float,  512, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(float,  256, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(float,  128, 2, 0, FP4)
+//MAKE_kQuantizeBlockwise(float,   64, 2, 0, FP4)
 
 MAKE_kQuantizeBlockwise(float, 4096, 4, 0, NF4)
 MAKE_kQuantizeBlockwise(float, 2048, 4, 0, NF4)
@@ -3807,13 +3992,40 @@ MAKE_kQuantizeBlockwise(float,  256, 2, 0, NF4)
 MAKE_kQuantizeBlockwise(float,  128, 2, 0, NF4)
 //MAKE_kQuantizeBlockwise(float,   64, 2, 0, NF4)
 
+MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 1, General8bit)
+MAKE_kQuantizeBlockwise(hip_bfloat16, 2048, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, General8bit)
+MAKE_kQuantizeBlockwise(hip_bfloat16,  512, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(hip_bfloat16,  256, 2, 0, General8bit)
+MAKE_kQuantizeBlockwise(hip_bfloat16,  128, 2, 0, General8bit)
+//MAKE_kQuantizeBlockwise(hip_bfloat16,   64, 2, 0, General8bit)
+
+MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(hip_bfloat16, 2048, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, FP4)
+MAKE_kQuantizeBlockwise(hip_bfloat16,  512, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(hip_bfloat16,  256, 2, 0, FP4)
+MAKE_kQuantizeBlockwise(hip_bfloat16,  128, 2, 0, FP4)
+//MAKE_kQuantizeBlockwise(hip_bfloat16,   64, 2, 0, FP4)
+
+MAKE_kQuantizeBlockwise(hip_bfloat16, 4096, 4, 0, NF4)
+MAKE_kQuantizeBlockwise(hip_bfloat16, 2048, 4, 0, NF4)
+MAKE_kQuantizeBlockwise(hip_bfloat16, 1024, 4, 0, NF4)
+MAKE_kQuantizeBlockwise(hip_bfloat16,  512, 2, 0, NF4)
+MAKE_kQuantizeBlockwise(hip_bfloat16,  256, 2, 0, NF4)
+MAKE_kQuantizeBlockwise(hip_bfloat16,  128, 2, 0, NF4)
+//MAKE_kQuantizeBlockwise(hip_bfloat16,   64, 2, 0, NF4)
 
 template __global__ void kDequantizeBlockwise<half, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
-template __global__ void kDequantizeBlockwise<float, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
 template __global__ void kDequantizeBlockwise<half, 512, 64, 8, General8bit>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
-template __global__ void kDequantizeBlockwise<float, 512, 64, 8, General8bit>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
 template __global__ void kDequantizeBlockwise<half, 512, 64, 8, NF4>(float *code, unsigned char * A, float * absmax, half *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<float, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<float, 512, 64, 8, General8bit>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
 template __global__ void kDequantizeBlockwise<float, 512, 64, 8, NF4>(float *code, unsigned char * A, float * absmax, float *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<hip_bfloat16, 512, 64, 8, FP4>(float *code, unsigned char * A, float * absmax, hip_bfloat16 *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<hip_bfloat16, 512, 64, 8, General8bit>(float *code, unsigned char * A, float * absmax, hip_bfloat16 *out, const int blocksize, const int n);
+template __global__ void kDequantizeBlockwise<hip_bfloat16, 512, 64, 8, NF4>(float *code, unsigned char * A, float * absmax, hip_bfloat16 *out, const int blocksize, const int n);
 
 #define MAKE_OptimizerStatic8bit2StateBlockwise(oname, gtype, block_size, num_per_thread) \
 template __global__ void kOptimizerStatic8bit2StateBlockwise<gtype, oname, block_size, num_per_thread>(gtype* p, gtype* __restrict__ const g, unsigned char* state1, unsigned char* state2, \
diff --git a/csrc/kernels.hiph b/csrc/kernels.hiph
index 1abae60d4..c842cc754 100644
--- a/csrc/kernels.hiph
+++ b/csrc/kernels.hiph
@@ -108,6 +108,7 @@ template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ voi
 
 template<typename T, int BLOCK_SIZE, int NUM_VALS> __global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int step, const int n);
 
+
 __global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n);
 
 
@@ -126,6 +127,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 
 template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A,  T* B,  T * out,  int lda, int ldb, int ldc);
 template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize);
+template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, const float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize);
 
 template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long n);
 
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 08f6fe122..09042028d 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -869,10 +869,21 @@ template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A,  unsi
 	//cout << m << endl;
 	//cout << n << endl;
 	//cout << k << endl;
- hipLaunchKernelGGL(( kgemm_4bit_inference<T, 160>),  dim3(num_blocks), dim3(160), 0, 0 , m,  n,  k, A,  B, absmax, out, lda, ldb, ldc, blocksize);
+ hipLaunchKernelGGL(( kgemm_4bit_inference<T, 96>),  dim3(num_blocks), dim3(96), 0, 0 , m,  n,  k, A,  B, absmax, out, lda, ldb, ldc, blocksize);
+  //kgemm_4bit_inference<T, 256><<< num_blocks, 256, 0, 0 >>>(m,  n,  k, A,  B, absmax, out, lda, ldb, ldc, blocksize);
+  //kgemm_4bit_inference<T, 160><<< num_blocks, 160, 0, 0 >>>(m,  n,  k, A,  B, absmax, out, lda, ldb, ldc, blocksize);
   //kgemm_4bit_inference<T, 32><<< num_blocks, 32, 0, 0 >>>(m,  n,  k, A,  B, absmax, out, lda, ldb, ldc, blocksize);
 }
 
+template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize)
+{
+
+	int num_blocks = (m+3)/4;
+
+  hipLaunchKernelGGL(( kgemm_4bit_inference_naive<T, 128, BITS>), dim3(num_blocks), dim3(128), 0, 0 , m,  n,  k, A,  B, absmax, datatype, out, lda, ldb, ldc, blocksize);
+  CUDA_CHECK_RETURN(hipPeekAtLastError());
+}
+
 template <typename T, int FUNC> void func(T *A, T *B, T value, long n)
 {
   int threads = 512;
@@ -893,6 +904,10 @@ template void func<float, ARANGE>(float *A, float *B, float value, long n);
 template void func<float, _MUL>(float *A, float *B, float value, long n);
 
 template void gemm_4bit_inference<half>(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize);
+template void gemm_4bit_inference_naive<half, 16>(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, float *datatype, half * out,  int lda, int ldb, int ldc, int blocksize);
+template void gemm_4bit_inference_naive<hip_bfloat16, 16>(int m, int n, int k, hip_bfloat16 * A,  unsigned char* B,  float *absmax, float *datatype, hip_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize);
+template void gemm_4bit_inference_naive<float, 32>(int m, int n, int k, float * A,  unsigned char* B,  float *absmax, float *datatype, float * out,  int lda, int ldb, int ldc, int blocksize);
+
 //template void gemm_host<float>(int m, int n, int k, float * A,  float* B,  float * out,  int lda, int ldb, int ldc, int bits);
 template void gemm_host<half>(int m, int n, int k, half * A,  half* B,  half * out,  int lda, int ldb, int ldc, int bits);
 template void extractOutliers<COL_TURING>(char * A, int *idx, char *out, int idx_size, int rows, int cols);
@@ -921,19 +936,27 @@ template void estimateQuantiles(half *A, float *code, float offset, int n);
 template void estimateQuantiles(float *A, float *code, float offset, int n);
 
 template void quantizeBlockwise<half, 1, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
-template void quantizeBlockwise<float, 1, General8bit>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template void quantizeBlockwise<half, 0, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
-template void quantizeBlockwise<float, 0, General8bit>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template void quantizeBlockwise<half, 0, FP4>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
-template void quantizeBlockwise<float, 0, FP4>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template void quantizeBlockwise<half, 0, NF4>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<float, 1, General8bit>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<float, 0, General8bit>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<float, 0, FP4>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template void quantizeBlockwise<float, 0, NF4>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
-template void dequantizeBlockwise<half, General8bit>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
+template void quantizeBlockwise<hip_bfloat16, 1, General8bit>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<hip_bfloat16, 0, General8bit>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<hip_bfloat16, 0, FP4>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<hip_bfloat16, 0, NF4>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+
 template void dequantizeBlockwise<float, General8bit>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
-template void dequantizeBlockwise<half, FP4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
 template void dequantizeBlockwise<float, FP4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
-template void dequantizeBlockwise<half, NF4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
 template void dequantizeBlockwise<float, NF4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
+template void dequantizeBlockwise<half, General8bit>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
+template void dequantizeBlockwise<half, FP4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
+template void dequantizeBlockwise<half, NF4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
+template void dequantizeBlockwise<hip_bfloat16, General8bit>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n);
+template void dequantizeBlockwise<hip_bfloat16, FP4>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n);
+template void dequantizeBlockwise<hip_bfloat16, NF4>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n);
 
 #define MAKE_optimizer32bit(name, gtype) \
 template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
diff --git a/csrc/ops.hiph b/csrc/ops.hiph
index 2a671509f..8e41f852a 100644
--- a/csrc/ops.hiph
+++ b/csrc/ops.hiph
@@ -209,6 +209,7 @@ void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rows
 
 template <typename T> void gemm_host(int m, int n, int k, T * A,  T* B,  T * out,  int lda, int ldb, int ldc, int bits);
 template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize);
+template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize);
 
 template <typename T, int FUNC> void func(T *A, T *B, T value, long n);
 
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index 34a5d86fb..ba551dcc3 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -34,8 +34,8 @@ void gemm_4bit_inference(int m, int n, int k, half * A,  unsigned char* B,  floa
 void gemm_4bit_inference_naive_fp16(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, float *datatype, half * out,  int lda, int ldb, int ldc, int blocksize)
 { gemm_4bit_inference_naive<half, 16>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
 
-void gemm_4bit_inference_naive_bf16(int m, int n, int k, __nv_bfloat16 * A,  unsigned char* B,  float *absmax, float *datatype, __nv_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize)
-{ gemm_4bit_inference_naive<__nv_bfloat16, 16>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
+void gemm_4bit_inference_naive_bf16(int m, int n, int k, hip_bfloat16 * A,  unsigned char* B,  float *absmax, float *datatype, hip_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize)
+{ gemm_4bit_inference_naive<hip_bfloat16, 16>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
 
 void gemm_4bit_inference_naive_fp32(int m, int n, int k, float * A,  unsigned char* B,  float *absmax, float *datatype, float * out,  int lda, int ldb, int ldc, int blocksize)
 { gemm_4bit_inference_naive<float, 32>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
@@ -118,9 +118,9 @@ void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char
 void quantizeBlockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
 void quantizeBlockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
 
-void quantizeBlockwise_bf16(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<__nv_bfloat16, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
-void quantizeBlockwise_bf16_fp4(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<__nv_bfloat16, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
-void quantizeBlockwise_bf16_nf4(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<__nv_bfloat16, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_bf16(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<hip_bfloat16, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_bf16_fp4(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<hip_bfloat16, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_bf16_nf4(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<hip_bfloat16, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
 
 void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
 void quantizeBlockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
@@ -134,9 +134,9 @@ void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, floa
 void dequantizeBlockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, FP4>(NULL, A, absmax, out, blocksize, n); }
 void dequantizeBlockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, NF4>(NULL, A, absmax, out, blocksize, n); }
 
-void dequantizeBlockwise_bf16(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<__nv_bfloat16, General8bit>(code, A, absmax, out, blocksize, n); }
-void dequantizeBlockwise_bf16_fp4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<__nv_bfloat16, FP4>(NULL, A, absmax, out, blocksize, n); }
-void dequantizeBlockwise_bf16_nf4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<__nv_bfloat16, NF4>(NULL, A, absmax, out, blocksize, n); }
+void dequantizeBlockwise_bf16(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<hip_bfloat16, General8bit>(code, A, absmax, out, blocksize, n); }
+void dequantizeBlockwise_bf16_fp4(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<hip_bfloat16, FP4>(NULL, A, absmax, out, blocksize, n); }
+void dequantizeBlockwise_bf16_nf4(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<hip_bfloat16, NF4>(NULL, A, absmax, out, blocksize, n); }
 
 
 #ifndef NO_HIPBLASLT
@@ -255,13 +255,13 @@ extern "C"
   void cdequantize_blockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); }
   void cdequantize_blockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); }
 
-  void cquantize_blockwise_bf16(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16(code, A, absmax, out, blocksize, n); }
-  void cquantize_blockwise_bf16_fp4(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16_fp4(code, A, absmax, out, blocksize, n); }
-  void cquantize_blockwise_bf16_nf4(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16_nf4(code, A, absmax, out, blocksize, n); }
+  void cquantize_blockwise_bf16(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16(code, A, absmax, out, blocksize, n); }
+  void cquantize_blockwise_bf16_fp4(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16_fp4(code, A, absmax, out, blocksize, n); }
+  void cquantize_blockwise_bf16_nf4(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16_nf4(code, A, absmax, out, blocksize, n); }
 
-  void cdequantize_blockwise_bf16(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16(code, A, absmax, out, blocksize, n); }
-  void cdequantize_blockwise_bf16_fp4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16_fp4(code, A, absmax, out, blocksize, n); }
-  void cdequantize_blockwise_bf16_nf4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16_nf4(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_bf16(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_bf16_fp4(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16_fp4(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_bf16_nf4(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16_nf4(code, A, absmax, out, blocksize, n); }
 
 	#define MAKE_CFUNC32(name, gtype, gbits) \
 	void c##name##32bit_grad_##gbits(gtype *g, gtype *p, \
@@ -504,6 +504,11 @@ extern "C"
 
 	void cprefetch(void *ptr, size_t bytes, int device)
 	{
+
+		int hasPrefetch = 0;
+		CUDA_CHECK_RETURN(hipDeviceGetAttribute(&hasPrefetch, hipDeviceAttributeConcurrentManagedAccess, device)); // 40ns overhead
+		if (hasPrefetch == 0) return;
+ 
 		CUDA_CHECK_RETURN(hipMemPrefetchAsync(ptr, bytes, device, 0));
 		CUDA_CHECK_RETURN(hipPeekAtLastError());
 	}
@@ -520,7 +525,7 @@ extern "C"
 	void cgemm_4bit_inference_naive_fp16(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, float *datatype, half * out,  int lda, int ldb, int ldc, int blocksize)
 	{ gemm_4bit_inference_naive_fp16(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
 
-	void cgemm_4bit_inference_naive_bf16(int m, int n, int k, __nv_bfloat16 * A,  unsigned char* B,  float *absmax, float *datatype, __nv_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize)
+	void cgemm_4bit_inference_naive_bf16(int m, int n, int k, hip_bfloat16 * A,  unsigned char* B,  float *absmax, float *datatype, hip_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize)
 	{ gemm_4bit_inference_naive_bf16(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
 
 	void cgemm_4bit_inference_naive_fp32(int m, int n, int k, float * A,  unsigned char* B,  float *absmax, float *datatype, float * out,  int lda, int ldb, int ldc, int blocksize)

From 1295d53c0c3518e94972e5b222844141da34d4f5 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Sat, 27 Jan 2024 00:08:57 +0000
Subject: [PATCH 049/234] Skip failing tests for now

---
 tests/test_functional.py   | 5 ++++-
 tests/test_generation.py   | 3 +++
 tests/test_linear8bitlt.py | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 5fb74a841..565a45f3f 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -583,7 +583,7 @@ def test_ibmm(dim1, dim2, dim3, dim4, transpose):
 values = list(product(dim1, dim2, dim3))
 names = ["dim1_{}_dim2_{}_dim3_{}".format(*vals) for vals in values]
 
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names)
 def test_vector_quant(dim1, dim2, dim3):
     dim2 = dim2 - (dim2 % 16)
@@ -2047,6 +2047,7 @@ def quant_zp(x):
     print(err1, err2, err3, err4, err5, err6)
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_extract_outliers():
     for i in range(k):
         shapeA = (4096, 4096 * 4)
@@ -2380,6 +2381,7 @@ def test_normal_map_tree():
 @pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed'], ids=['fc1', 'fc2', 'attn', 'attn_packed'])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])
 @pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=['uint8', 'fp16', 'bf16', 'fp32'])
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind):
     for dim in [128, 256, 512, 1024]:
     #for dim in [4*1024]:
@@ -2547,6 +2549,7 @@ def test_managed():
 @pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4'])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])
 @pytest.mark.parametrize("double_quant", [False], ids=['DQ_True'])
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_gemv_eye_4bit(storage_type, dtype, double_quant):
     dims = 10
     torch.random.manual_seed(np.random.randint(0, 412424242))
diff --git a/tests/test_generation.py b/tests/test_generation.py
index ecafdddf8..54ec10475 100644
--- a/tests/test_generation.py
+++ b/tests/test_generation.py
@@ -15,6 +15,8 @@
 
 )
 
+import bitsandbytes as bnb
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 
 
 def get_4bit_config():
@@ -79,6 +81,7 @@ def model_and_tokenizer(request):
 @pytest.mark.parametrize("DQ", [True, False], ids=['DQ_True', 'DQ_False'])
 @pytest.mark.parametrize("inference_kernel", [True, False], ids=['inference_kernel_True', 'inference_kernel_False'])
 #@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ):
     print('')
     dtype = torch.float16
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 6d48bee34..6d5fc6a82 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -15,6 +15,7 @@
 # contributed by Alex Borzunov, see:
 # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.skipif(
     not torch.cuda.is_available() or torch.cuda.get_device_capability() < (7, 5),
     reason="this test requires a turing-generation or newer GPU, see bitsandbytes docs",

From f1a0b8b33f595fb3cbf12abd54d59d029de99f5a Mon Sep 17 00:00:00 2001
From: iiisak <41641715+iiisak@users.noreply.github.com>
Date: Sat, 3 Feb 2024 00:33:59 +0100
Subject: [PATCH 050/234] ops.hip: adapt to enum naming changes in
 ROCm/hipBLASLt@95131d6 and ROCm/hipBLASLt@3aad0d8

---
 csrc/ops.hip | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index 09042028d..54743d111 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -393,15 +393,15 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
 
   if(DTYPE == 8)
   {
-    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLASLT_R_8I, dim1, dim2, ldA));
-    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&B_desc, HIPBLASLT_R_8I, 0, 0, 0));
-    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLASLT_R_8I, dim1, dim2, ldOut));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIP_R_8I, dim1, dim2, ldA));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&B_desc, HIP_R_8I, 0, 0, 0));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIP_R_8I, dim1, dim2, ldOut));
   }
   else if(DTYPE == 32)
   {
-    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIPBLASLT_R_32I, dim1, dim2, ldA));
-    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&B_desc, HIPBLASLT_R_32I, 0, 0, 0));
-    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIPBLASLT_R_32I, dim1, dim2, ldOut));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIP_R_32I, dim1, dim2, ldA));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&B_desc, HIP_R_32I, 0, 0, 0));
+    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIP_R_32I, dim1, dim2, ldOut));
   }
   else
   {
@@ -411,7 +411,7 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
   checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(A_desc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &orderA, sizeof(orderA)));
   checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(out_desc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &orderOut, sizeof(orderOut)));
 
-  checkHipblasStatus(hipblasLtMatrixTransformDescCreate(&A2Out_desc, HIPBLASLT_R_32F));
+  checkHipblasStatus(hipblasLtMatrixTransformDescCreate(&A2Out_desc, HIP_R_32F));
 
   if(transpose){ checkHipblasStatus(hipblasLtMatrixTransformDescSetAttribute(A2Out_desc, HIPBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); }
 
@@ -510,8 +510,8 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
     hipblasLtOrder_t col_turing = HIPBLASLT_ORDER_COL;
     hipblasLtOrder_t col_ampere = HIPBLASLT_ORDER_COL;
 
-    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Adesc, HIPBLASLT_R_8I, m, k, lda));
-    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Bdesc, HIPBLASLT_R_8I, n, k, ldb));
+    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Adesc, HIP_R_8I, m, k, lda));
+    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Bdesc, HIP_R_8I, n, k, ldb));
     has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Adesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
 
 
@@ -528,14 +528,14 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
 
     if(DTYPE_OUT == 32)
     {
-      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLASLT_COMPUTE_I32, HIPBLASLT_R_32I));
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIP_R_32I));
       auto opA = HIPBLAS_OP_N;
       has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(int32_t)));
       has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(int32_t)));
       hipblasLtEpilogue_t epilogue = HIPBLASLT_EPILOGUE_DEFAULT;
       checkHipblasStatus(hipblasLtMatmulDescSetAttribute(
         matmulDesc, HIPBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)));
-      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLASLT_R_32I, m, n, ldc));
+      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIP_R_32I, m, n, ldc));
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
       int alpha = 1, beta = 0;
 
@@ -578,9 +578,9 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
     }
     else
     {
-      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLASLT_COMPUTE_I32, HIPBLASLT_R_32F));
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIP_R_32F));
       has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
-      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIPBLASLT_R_8I, m, n, ldc));
+      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIP_R_8I, m, n, ldc));
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
       /* Algo and workspace TODO: need to rework to not be duplicated */
       // Set User Preference attributes

From cebd83c10e4c4847e448ad0949af033bd8d1c4ef Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Tue, 6 Feb 2024 07:21:01 -0800
Subject: [PATCH 051/234] refine backend register with base-backend

---
 bitsandbytes/backends/__init__.py      | 28 +++-----
 bitsandbytes/backends/basic_backend.py | 92 ++++++++++++++++++++++++++
 bitsandbytes/backends/cuda.py          |  8 ++-
 3 files changed, 107 insertions(+), 21 deletions(-)
 create mode 100644 bitsandbytes/backends/basic_backend.py

diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index fd046d506..496e7d671 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -1,5 +1,4 @@
-from .cuda import CUDABackend
-
+from bitsandbytes.cextension import COMPILED_WITH_CUDA
 
 class Backends:
     """
@@ -13,28 +12,17 @@ class Backends:
     devices = {}
 
     @classmethod
-    def register_backend(cls, backend_name: str, backend_class):
+    def register_backend(cls, backend_name: str, backend_instance):
         assert backend_name.lower() in {
             "cpu",
             "cuda",
             "xpu",
         }, "register device backend choices in [cpu, cuda, xpu]"
 
-        # check 8bits and 4bits interfaces
-        if (
-            hasattr(backend_class, "double_quant")
-            and hasattr(backend_class, "transform")
-            and hasattr(backend_class, "igemmlt")
-            and hasattr(backend_class, "mm_dequant")
-            and hasattr(backend_class, "extract_outliers")
-            and hasattr(backend_class, "quantize_4bit")
-            and hasattr(backend_class, "dequantize_4bit")
-        ):
-            cls.devices[backend_name.lower()] = backend_class
-        else:
-            assert (
-                False
-            ), f"register device backend {backend_name.lower()} but its interfaces are not compelete"
-
+        cls.devices[backend_name.lower()] = backend_instance
 
-Backends.register_backend("cuda", CUDABackend)
+if COMPILED_WITH_CUDA:
+    from .cuda import CUDABackend
+    cuda_backend = CUDABackend(torch.device("cuda").type)
+    Backends.register_backend(cuda_backend.get_name(), cuda_backend)
+# TODO: register more backends support 
\ No newline at end of file
diff --git a/bitsandbytes/backends/basic_backend.py b/bitsandbytes/backends/basic_backend.py
new file mode 100644
index 000000000..8565c5f73
--- /dev/null
+++ b/bitsandbytes/backends/basic_backend.py
@@ -0,0 +1,92 @@
+from abc import ABC, abstractmethod
+import torch
+from typing import Optional, Tuple
+from bitsandbytes.functional import QuantState
+
+
+class DeviceBackends(ABC):
+    """Base class for devices backends that will implement their own 8bits and 4bits functions."""
+
+    @abstractmethod
+    def get_name(self) -> str:
+        """Name of the device as the backend support."""
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def double_quant(
+        cls,
+        A,
+        col_stats=None,
+        row_stats=None,
+        out_col=None,
+        out_row=None,
+        threshold=0.0,
+    ):
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def transform(
+        cls,
+        A,
+        to_order,
+        from_order="row",
+        out=None,
+        transpose=False,
+        state=None,
+        ld=None,
+    ):
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def igemmlt(cls, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def mm_dequant(
+        cls,
+        A,
+        quant_state,
+        row_stats,
+        col_stats,
+        out=None,
+        new_row_stats=None,
+        new_col_stats=None,
+        bias=None,
+    ):
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def extract_outliers(cls, A, SA, idx):
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def quantize_4bit(
+        cls,
+        A: torch.Tensor,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type="fp4",
+        quant_storage=torch.uint8,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def dequantize_4bit(
+        cls,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 64,
+        quant_type="fp4",
+    ) -> torch.Tensor:
+        raise NotImplementedError
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index f90c3d1e9..7680bf2a1 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -18,9 +18,15 @@
 )
 from bitsandbytes.functional import CUBLAS_Context, QuantState
 from bitsandbytes.cextension import lib
+from .basic_backend import DeviceBackends
 
+class CUDABackend(DeviceBackends):
+    def __init__(self, backend_name: str):
+        self.backend_name = backend_name
+
+    def get_name(self) -> str:
+        return self.backend_name
 
-class CUDABackend:
     @classmethod
     def double_quant(
         cls, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0

From d20c01764d4980699667795c9711c7d505b9db1c Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Tue, 6 Feb 2024 23:26:19 +0800
Subject: [PATCH 052/234] minor clean format

---
 tests/test_cuda_setup_evaluator.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py
index 914b7414a..e3620bf41 100644
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -19,11 +19,3 @@ def test_manual_override(requires_cuda):
     import bitsandbytes as bnb
     loaded_lib = bnb.device_setup.cuda.main.CUDASetup.get_instance().binary_name
     #assert loaded_lib == 'libbitsandbytes_cuda122.so'
-
-
-
-
-
-
-
-

From a84c369a1267f7dcd345208bbcc1a78466eb641d Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Tue, 6 Feb 2024 23:15:32 +0000
Subject: [PATCH 053/234] fix wmma api parity

---
 csrc/kernels.hip | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 64a93cc6e..f48f8b991 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -3093,10 +3093,10 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,
   __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))];
   //__shared__ T smem_C[8*32];
 
-   wmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major> a_frag;
-   wmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major> b_frag;
-   wmma::fragment<wmma::accumulator, 8, 32, 16, half> c_frag;
-   wmma::fill_fragment(c_frag, 0.0f);
+   rocwmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major> a_frag;
+   rocwmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major> b_frag;
+   rocwmma::fragment<wmma::accumulator, 8, 32, 16, half> c_frag;
+   rocwmma::fill_fragment(c_frag, 0.0f);
 
   int ticktock = 0;
   int idx = 0 + threadIdx.x;
@@ -3251,9 +3251,9 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,
     if(warp_id == (WARPS-1))
       for(int k = 0; k < batch_size_warps; k++)
       {
-        wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
-        wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
-        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+        rocwmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
+        rocwmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
+        rocwmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
       }
   }
 
@@ -3265,14 +3265,14 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,
   ticktock = ticktock == 0 ? 1 : 0;
   for(int k = 0; k < batch_size_warps; k++)
   {
-    wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
-    wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
-    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+    rocwmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
+    rocwmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
+    rocwmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
   }
 
   // 129 mu
   if(warp_id == (WARPS-1))
-    wmma::store_matrix_sync(&(smem_A[0]), c_frag, 32, wmma::mem_row_major);
+    rocwmma::store_matrix_sync(&(smem_A[0]), c_frag, 32, wmma::mem_row_major);
 
   if(col_offset + warp_lane < M)
     out[col_offset + warp_lane] = smem_A[warp_lane];
@@ -3322,10 +3322,10 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
   __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))];
   __shared__ T smem_C[8*32];
 
-   wmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major> a_frag;
-   wmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major> b_frag;
-   wmma::fragment<wmma::accumulator, 8, 32, 16, half> c_frag;
-   wmma::fill_fragment(c_frag, 0.0f);
+   rocwmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major> a_frag;
+   rocwmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major> b_frag;
+   rocwmma::fragment<wmma::accumulator, 8, 32, 16, half> c_frag;
+   rocwmma::fill_fragment(c_frag, 0.0f);
 
   for(int i = threadIdx.x; i < (8*32); i+=blockDim.x)
     smem_C[i] = 0.0f;
@@ -3466,9 +3466,9 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
     if(warp_id == (WARPS-1))
       for(int k = 0; k < batch_size_warps; k++)
       {
-        wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
-        wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
-        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+        rocwmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
+        rocwmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
+        rocwmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
       }
   }
 
@@ -3487,14 +3487,14 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
   {
     //if(warp_lane == 0)
       //printf("%i %i %i %i\n", (ticktock*batch_size_warps + k)*a_tile_offset, k, ticktock, threadIdx.x);
-    wmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
-    wmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
-    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+    rocwmma::load_matrix_sync(a_frag, &(smem_A[(ticktock*batch_size_warps + k)*a_tile_offset]), 16); //  111 mu
+    rocwmma::load_matrix_sync(b_frag, &(smem_B[(ticktock*batch_size_warps + k)*b_tile_offset]), 16); // 35 mu
+    rocwmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
   }
 
   // 129 mu
   if(warp_id == (WARPS-1))
-    wmma::store_matrix_sync(&(smem_C[0]), c_frag, 32, wmma::mem_row_major);
+    rocwmma::store_matrix_sync(&(smem_C[0]), c_frag, 32, wmma::mem_row_major);
 
   //printnonzero<T>(smem_C, 32, "");
 

From b044010a6a6864be6ed115a4e5a9d271755b53e0 Mon Sep 17 00:00:00 2001
From: Lzy17 <joeylzy0710@gmail.com>
Date: Wed, 7 Feb 2024 01:56:30 +0000
Subject: [PATCH 054/234] hipify wmma datatype

---
 csrc/kernels.hip | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index f48f8b991..1f8c97e32 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -3093,9 +3093,9 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,
   __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))];
   //__shared__ T smem_C[8*32];
 
-   rocwmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major> a_frag;
-   rocwmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major> b_frag;
-   rocwmma::fragment<wmma::accumulator, 8, 32, 16, half> c_frag;
+   rocwmma::fragment<rocwmma::matrix_a, 8, 32, 16, half, rocwmma::row_major> a_frag;
+   rocwmma::fragment<rocwmma::matrix_b, 8, 32, 16, half, rocwmma::col_major> b_frag;
+   rocwmma::fragment<rocwmma::accumulator, 8, 32, 16, half> c_frag;
    rocwmma::fill_fragment(c_frag, 0.0f);
 
   int ticktock = 0;
@@ -3272,7 +3272,7 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,
 
   // 129 mu
   if(warp_id == (WARPS-1))
-    rocwmma::store_matrix_sync(&(smem_A[0]), c_frag, 32, wmma::mem_row_major);
+    rocwmma::store_matrix_sync(&(smem_A[0]), c_frag, 32, rocwmma::mem_row_major);
 
   if(col_offset + warp_lane < M)
     out[col_offset + warp_lane] = smem_A[warp_lane];
@@ -3322,9 +3322,9 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
   __shared__ T smem_B[2*batch_size_warps*16*32 + (2*16*(batch_size_warps-1))];
   __shared__ T smem_C[8*32];
 
-   rocwmma::fragment<wmma::matrix_a, 8, 32, 16, half, wmma::row_major> a_frag;
-   rocwmma::fragment<wmma::matrix_b, 8, 32, 16, half, wmma::col_major> b_frag;
-   rocwmma::fragment<wmma::accumulator, 8, 32, 16, half> c_frag;
+   rocwmma::fragment<rocwmma::matrix_a, 8, 32, 16, half, rocwmma::row_major> a_frag;
+   rocwmma::fragment<rocwmma::matrix_b, 8, 32, 16, half, rocwmma::col_major> b_frag;
+   rocwmma::fragment<rocwmma::accumulator, 8, 32, 16, half> c_frag;
    rocwmma::fill_fragment(c_frag, 0.0f);
 
   for(int i = threadIdx.x; i < (8*32); i+=blockDim.x)
@@ -3494,7 +3494,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
 
   // 129 mu
   if(warp_id == (WARPS-1))
-    rocwmma::store_matrix_sync(&(smem_C[0]), c_frag, 32, wmma::mem_row_major);
+    rocwmma::store_matrix_sync(&(smem_C[0]), c_frag, 32, rocwmma::mem_row_major);
 
   //printnonzero<T>(smem_C, 32, "");
 

From b41c1c4d68a6c4b2154c582ec01c2d2b5bd36f63 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Tue, 6 Feb 2024 21:36:50 -0800
Subject: [PATCH 055/234] format in CI

---
 bitsandbytes/__init__.py               |  2 +-
 bitsandbytes/backends/__init__.py      |  3 ++-
 bitsandbytes/backends/basic_backend.py |  4 +++-
 bitsandbytes/backends/cuda.py          | 27 +++++++++++++++-----------
 bitsandbytes/cextension.py             |  1 +
 bitsandbytes/functional.py             |  2 +-
 6 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 1045070cd..512fd2455 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-from . import device_setup, utils, research
+from . import device_setup, research, utils
 from .autograd._functions import (
     MatmulLtState,
     bmm_cublas,
diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index 496e7d671..bf8a76cba 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -1,5 +1,6 @@
 from bitsandbytes.cextension import COMPILED_WITH_CUDA
 
+
 class Backends:
     """
     An dict class for device backends that registered with 8bits and 4bits functions.
@@ -25,4 +26,4 @@ def register_backend(cls, backend_name: str, backend_instance):
     from .cuda import CUDABackend
     cuda_backend = CUDABackend(torch.device("cuda").type)
     Backends.register_backend(cuda_backend.get_name(), cuda_backend)
-# TODO: register more backends support 
\ No newline at end of file
+# TODO: register more backends support
diff --git a/bitsandbytes/backends/basic_backend.py b/bitsandbytes/backends/basic_backend.py
index 8565c5f73..b97723d81 100644
--- a/bitsandbytes/backends/basic_backend.py
+++ b/bitsandbytes/backends/basic_backend.py
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
-import torch
 from typing import Optional, Tuple
+
+import torch
+
 from bitsandbytes.functional import QuantState
 
 
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 7680bf2a1..965138a69 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -1,25 +1,30 @@
-import torch
-from torch import Tensor
 import ctypes as ct
 from typing import Optional, Tuple
+
+import torch
+from torch import Tensor
+
+from bitsandbytes.cextension import lib
 from bitsandbytes.functional import (
-    pre_call,
-    post_call,
+    CUBLAS_Context,
+    QuantState,
+    coo_zeros,
+    dequantize_blockwise,
+    dtype2bytes,
+    get_4bit_type,
     get_colrow_absmax,
     get_ptr,
-    is_on_gpu,
-    coo_zeros,
     get_transform_buffer,
+    is_on_gpu,
+    post_call,
+    pre_call,
     prod,
-    get_4bit_type,
     quantize_blockwise,
-    dequantize_blockwise,
-    dtype2bytes,
 )
-from bitsandbytes.functional import CUBLAS_Context, QuantState
-from bitsandbytes.cextension import lib
+
 from .basic_backend import DeviceBackends
 
+
 class CUDABackend(DeviceBackends):
     def __init__(self, backend_name: str):
         self.backend_name = backend_name
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 0848784c0..dab34982e 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -2,6 +2,7 @@
 from warnings import warn
 
 import torch
+
 from bitsandbytes.device_setup.cuda.main import CUDASetup
 
 setup = CUDASetup.get_instance()
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index c2fb491dd..f8a9723cb 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -2081,6 +2081,7 @@ def pipeline_test(A, batch_size):
 
 from bitsandbytes.backends import Backends
 
+
 # 8 bits common functions
 def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
     assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
@@ -2127,4 +2128,3 @@ def quantize_4bit(
 def dequantize_4bit(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
     assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
     return Backends.devices[A.device.type].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
-

From 1ab611e889a2ad069093e355fb6921486b59856c Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Tue, 6 Feb 2024 21:59:50 -0800
Subject: [PATCH 056/234] minor fix for format

---
 bitsandbytes/backends/__init__.py | 2 +-
 bitsandbytes/functional.py        | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index bf8a76cba..793d98dc5 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -1,5 +1,5 @@
 from bitsandbytes.cextension import COMPILED_WITH_CUDA
-
+import torch
 
 class Backends:
     """
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index f8a9723cb..e6649ba34 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -16,6 +16,7 @@
 
 from .cextension import COMPILED_WITH_CUDA, lib
 
+from bitsandbytes.backends import Backends
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):
@@ -2079,9 +2080,6 @@ def pipeline_test(A, batch_size):
     lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
     return out
 
-from bitsandbytes.backends import Backends
-
-
 # 8 bits common functions
 def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
     assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"

From b933f9f1c686979d6dbf9ea97c753561162459e9 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Wed, 7 Feb 2024 23:00:06 +0800
Subject: [PATCH 057/234] refactor base backend registering

Co-authored-by: Aarni Koskela <akx@iki.fi>
---
 bitsandbytes/backends/__init__.py | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index 793d98dc5..084cfa3e0 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -1,29 +1,14 @@
-from bitsandbytes.cextension import COMPILED_WITH_CUDA
+import typing
 import torch
 
-class Backends:
-    """
-    An dict class for device backends that registered with 8bits and 4bits functions.
-
-    The values of this device backends are lowercase strings, e.g., ``"cuda"``. They can
-    be accessed as attributes with key-value, e.g., ``Backends.device["cuda"]``.
-
-    """
-
-    devices = {}
+from bitsandbytes.cextension import COMPILED_WITH_CUDA
+from bitsandbytes.backends.base import Backend
 
-    @classmethod
-    def register_backend(cls, backend_name: str, backend_instance):
-        assert backend_name.lower() in {
-            "cpu",
-            "cuda",
-            "xpu",
-        }, "register device backend choices in [cpu, cuda, xpu]"
+backends: Dict[str, Backend] = {}
 
-        cls.devices[backend_name.lower()] = backend_instance
+def register_backend(backend_name: str, backend_instance: Backend):
+    backends[backend_name.lower()] = backend_instance
 
 if COMPILED_WITH_CUDA:
     from .cuda import CUDABackend
-    cuda_backend = CUDABackend(torch.device("cuda").type)
-    Backends.register_backend(cuda_backend.get_name(), cuda_backend)
-# TODO: register more backends support
+    register_backend("cuda", CUDABackend())

From 8b4baaa4ac53dc5051ba29cd5cd4093f7b149aad Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Wed, 7 Feb 2024 07:38:24 -0800
Subject: [PATCH 058/234] refine structures of backends

---
 bitsandbytes/backends/__init__.py      |   2 +-
 bitsandbytes/backends/base.py          | 133 ++++++++++++
 bitsandbytes/backends/basic_backend.py |  94 ---------
 bitsandbytes/backends/cuda.py          |  88 ++------
 bitsandbytes/functional.py             | 275 ++++++++++---------------
 bitsandbytes/utils.py                  | 120 ++++++++++-
 6 files changed, 372 insertions(+), 340 deletions(-)
 create mode 100644 bitsandbytes/backends/base.py
 delete mode 100644 bitsandbytes/backends/basic_backend.py

diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index 084cfa3e0..0ae01a3d3 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -1,4 +1,4 @@
-import typing
+from typing import Dict
 import torch
 
 from bitsandbytes.cextension import COMPILED_WITH_CUDA
diff --git a/bitsandbytes/backends/base.py b/bitsandbytes/backends/base.py
new file mode 100644
index 000000000..8232d17c1
--- /dev/null
+++ b/bitsandbytes/backends/base.py
@@ -0,0 +1,133 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Tuple
+
+import torch
+
+from bitsandbytes.utils import QuantState
+
+
+class Backend(ABC):
+    """Base class for devices backends that will implement their own 8bits and 4bits functions."""
+
+    @abstractmethod
+    def double_quant(
+        self,
+        A,
+        col_stats=None,
+        row_stats=None,
+        out_col=None,
+        out_row=None,
+        threshold=0.0,
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def transform(
+        self,
+        A,
+        to_order,
+        from_order="row",
+        out=None,
+        transpose=False,
+        state=None,
+        ld=None,
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+        raise NotImplementedError
+
+    @abstractmethod
+    def mm_dequant(
+        self,
+        A,
+        quant_state,
+        row_stats,
+        col_stats,
+        out=None,
+        new_row_stats=None,
+        new_col_stats=None,
+        bias=None,
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def extract_outliers(self, A, SA, idx):
+        raise NotImplementedError
+
+    @abstractmethod
+    def quantize_4bit(
+        self,
+        A: torch.Tensor,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type="fp4",
+        quant_storage=torch.uint8,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        """
+        Quantize tensor A in blocks of 4-bit values.
+
+        Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
+
+        Parameters
+        ----------
+        A : torch.Tensor
+            The input tensor.
+        absmax : torch.Tensor
+            The absmax values.
+        out : torch.Tensor
+            The output tensor.
+        blocksize : int
+            The blocksize used in quantization.
+        quant_type : str
+            The 4-bit quantization data type {fp4, nf4}
+
+        Returns
+        -------
+        torch.Tensor:
+            Tensor with packed 4-bit values.
+        tuple(torch.Tensor, torch.Size, torch.dtype, int):
+            The quantization state to undo the quantization.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def dequantize_4bit(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 64,
+        quant_type="fp4",
+    ) -> torch.Tensor:
+        """
+        Dequantizes FP4 blockwise quantized values.
+
+        Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
+
+        Parameters
+        ----------
+        A : torch.Tensor
+            The input tensor (packed 4-bit values).
+        quant_state : QuantState
+            object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
+        absmax : torch.Tensor
+            The absmax values.
+        out : torch.Tensor
+            Dequantized output tensor.
+        blocksize : int
+            The blocksize used in quantization.
+        quant_type : str
+            The 4-bit quantization data type {fp4, nf4}
+
+
+        Returns
+        -------
+        torch.Tensor:
+            Dequantized tensor.
+        """
+        raise NotImplementedError
diff --git a/bitsandbytes/backends/basic_backend.py b/bitsandbytes/backends/basic_backend.py
deleted file mode 100644
index b97723d81..000000000
--- a/bitsandbytes/backends/basic_backend.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Optional, Tuple
-
-import torch
-
-from bitsandbytes.functional import QuantState
-
-
-class DeviceBackends(ABC):
-    """Base class for devices backends that will implement their own 8bits and 4bits functions."""
-
-    @abstractmethod
-    def get_name(self) -> str:
-        """Name of the device as the backend support."""
-        raise NotImplementedError
-
-    @classmethod
-    @abstractmethod
-    def double_quant(
-        cls,
-        A,
-        col_stats=None,
-        row_stats=None,
-        out_col=None,
-        out_row=None,
-        threshold=0.0,
-    ):
-        raise NotImplementedError
-
-    @classmethod
-    @abstractmethod
-    def transform(
-        cls,
-        A,
-        to_order,
-        from_order="row",
-        out=None,
-        transpose=False,
-        state=None,
-        ld=None,
-    ):
-        raise NotImplementedError
-
-    @classmethod
-    @abstractmethod
-    def igemmlt(cls, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
-        raise NotImplementedError
-
-    @classmethod
-    @abstractmethod
-    def mm_dequant(
-        cls,
-        A,
-        quant_state,
-        row_stats,
-        col_stats,
-        out=None,
-        new_row_stats=None,
-        new_col_stats=None,
-        bias=None,
-    ):
-        raise NotImplementedError
-
-    @classmethod
-    @abstractmethod
-    def extract_outliers(cls, A, SA, idx):
-        raise NotImplementedError
-
-    @classmethod
-    @abstractmethod
-    def quantize_4bit(
-        cls,
-        A: torch.Tensor,
-        absmax: Optional[torch.Tensor] = None,
-        out: Optional[torch.Tensor] = None,
-        blocksize=64,
-        compress_statistics=False,
-        quant_type="fp4",
-        quant_storage=torch.uint8,
-    ) -> Tuple[torch.Tensor, QuantState]:
-        raise NotImplementedError
-
-    @classmethod
-    @abstractmethod
-    def dequantize_4bit(
-        cls,
-        A: torch.Tensor,
-        quant_state: Optional[QuantState] = None,
-        absmax: Optional[torch.Tensor] = None,
-        out: Optional[torch.Tensor] = None,
-        blocksize: int = 64,
-        quant_type="fp4",
-    ) -> torch.Tensor:
-        raise NotImplementedError
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 965138a69..248d1e4c1 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -2,12 +2,10 @@
 from typing import Optional, Tuple
 
 import torch
-from torch import Tensor
 
 from bitsandbytes.cextension import lib
 from bitsandbytes.functional import (
     CUBLAS_Context,
-    QuantState,
     coo_zeros,
     dequantize_blockwise,
     dtype2bytes,
@@ -22,19 +20,14 @@
     quantize_blockwise,
 )
 
-from .basic_backend import DeviceBackends
+from bitsandbytes.utils import QuantState
 
+from .base import Backend
 
-class CUDABackend(DeviceBackends):
-    def __init__(self, backend_name: str):
-        self.backend_name = backend_name
 
-    def get_name(self) -> str:
-        return self.backend_name
-
-    @classmethod
+class CUDABackend(Backend):
     def double_quant(
-        cls, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
+        self, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
     ):
         device = A.device
         assert A.dtype == torch.half
@@ -128,8 +121,7 @@ def double_quant(
 
         return out_row, out_col, row_stats, col_stats, coo_tensor
 
-    @classmethod
-    def transform(cls, A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+    def transform(self, A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
         prev_device = pre_call(A.device)
         if state is None: state = (A.shape, from_order)
         else: from_order = state[1]
@@ -172,8 +164,7 @@ def transform(cls, A, to_order, from_order='row', out=None, transpose=False, sta
 
         return out, new_state
 
-    @classmethod
-    def igemmlt(cls, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+    def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         shapeA = SA[0]
         shapeB = SB[0]
         dimsA = len(shapeA)
@@ -272,9 +263,8 @@ def igemmlt(cls, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
 
         return out, Sout
 
-    @classmethod
     def mm_dequant(
-        cls,
+        self,
         A,
         quant_state,
         row_stats,
@@ -324,8 +314,7 @@ def mm_dequant(
 
         return out
 
-    @classmethod
-    def extract_outliers(cls, A, SA, idx):
+    def extract_outliers(self, A, SA, idx):
         shapeA = SA[0]
         formatA = SA[1]
         assert formatA in ["col_turing", "col_ampere"]
@@ -351,42 +340,16 @@ def extract_outliers(cls, A, SA, idx):
 
         return out
 
-    @classmethod
     def quantize_4bit(
-        cls,
-        A: Tensor,
+        self,
+        A: torch.Tensor,
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
         blocksize=64,
         compress_statistics=False,
         quant_type='fp4',
         quant_storage=torch.uint8,
-    ) -> Tuple[Tensor, QuantState]:
-        """
-        Quantize tensor A in blocks of 4-bit values.
-
-        Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
-
-        Parameters
-        ----------
-        A : torch.Tensor
-            The input tensor.
-        absmax : torch.Tensor
-            The absmax values.
-        out : torch.Tensor
-            The output tensor.
-        blocksize : int
-            The blocksize used in quantization.
-        quant_type : str
-            The 4-bit quantization data type {fp4, nf4}
-
-        Returns
-        -------
-        torch.Tensor:
-            Tensor with packed 4-bit values.
-        tuple(torch.Tensor, torch.Size, torch.dtype, int):
-            The quantization state to undo the quantization.
-        """
+    ) -> Tuple[torch.Tensor, QuantState]:
         if A.device.type != 'cuda':
             raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
         if quant_type not in ['fp4', 'nf4']:
@@ -442,34 +405,7 @@ def quantize_4bit(
 
         return out, state
 
-    @classmethod
-    def dequantize_4bit(cls, A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
-        """
-        Dequantizes FP4 blockwise quantized values.
-
-        Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
-
-        Parameters
-        ----------
-        A : torch.Tensor
-            The input tensor (packed 4-bit values).
-        quant_state : QuantState
-            object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
-        absmax : torch.Tensor
-            The absmax values.
-        out : torch.Tensor
-            Dequantized output tensor.
-        blocksize : int
-            The blocksize used in quantization.
-        quant_type : str
-            The 4-bit quantization data type {fp4, nf4}
-
-
-        Returns
-        -------
-        torch.Tensor:
-            Dequantized tensor.
-        """
+    def dequantize_4bit(self, A: torch.Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, quant_type='fp4') -> torch.Tensor:
         if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
             raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
         if quant_type not in ['fp4', 'nf4']:
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index e6649ba34..b75eac67e 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -16,7 +16,9 @@
 
 from .cextension import COMPILED_WITH_CUDA, lib
 
-from bitsandbytes.backends import Backends
+from bitsandbytes.utils import QuantState
+
+from bitsandbytes.backends import backends
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):
@@ -589,125 +591,6 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl
     return out
 
 
-class QuantState:
-    """container for quantization state components to work with Params4bit and similar classes"""
-    valid_quant_types = ('fp4', 'nf4')
-    valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
-    valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type',
-                     'blocksize', 'dtype', 'shape', 'nested_blocksize', 'nested_dtype', 'nested_offset']
-
-    def __init__(self, absmax, shape=None, code=None, blocksize=None, quant_type=None, dtype=None, offset=None, state2=None):
-        self.absmax = absmax
-        self.shape = shape
-        self.code = code
-        self.dtype = dtype
-        self.blocksize = blocksize
-        self.quant_type = quant_type
-        self.offset = offset
-        self.state2 = state2
-        self.nested = state2 is not None
-
-    def __get_item__(self, idx):
-        """
-        ensures compatibility with older quant state scheme with nested lists.
-        assumes the following layout:
-        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
-        state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
-        """
-        if self.nested:
-            list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, [self.offset, self.state2], self.quant_type]
-        else:
-            list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, None, self.quant_type]
-        return list_repr[idx]
-
-    @classmethod
-    def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> 'QuantState':
-        """
-        unpacks components of state_dict into QuantState
-        where necessary, convert into strings, torch.dtype, ints, etc.
-
-        qs_dict: based on state_dict, with only relevant keys, striped of prefixes.
-
-        item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.
-        """
-
-        # unpacking tensor with non-tensor components
-        qs_key = [k for k, v in qs_dict.items() if "quant_state" in k and isinstance(v, torch.Tensor)]
-        if not len(qs_key) and 'quant_type' not in qs_dict:
-            raise ValueError("Expected packed or unpacked quant_state items, found neither")
-        elif len(qs_key) != 1 or qs_key[0].split(".")[-1] not in cls.valid_qs_type_keys:
-            raise ValueError(f"There should be exactly one `quant_state` item with ending from {cls.valid_qs_type_keys}.\nDetected {qs_key}.")
-
-        # unpacking minor and non-tensor quant state items if necessary
-        if len(qs_key) == 1:
-            first_qs_key = qs_key[0]
-            qs_dict.update(unpack_tensor_to_dict(qs_dict.pop(first_qs_key)))
-
-        qs_dict = {k.split('.')[-1]: v for k, v in qs_dict.items()}  # strip prefixes
-        assert set(qs_dict.keys()).issubset(cls.valid_qs_keys)
-
-        if 'nested_absmax' in qs_dict:
-            offset = torch.tensor(float(qs_dict['nested_offset'])).to(device)
-            state2 = cls(
-                absmax=qs_dict['nested_absmax'].to(device),
-                blocksize=qs_dict['nested_blocksize'],
-                code=qs_dict['nested_quant_map'].to(device),
-                dtype=getattr(torch, qs_dict['nested_dtype']),
-            )
-        else:
-            offset, state2 = None, None
-
-        quant_state = cls(
-            quant_type=qs_dict['quant_type'],
-            absmax=qs_dict['absmax'].to(device),
-            blocksize=qs_dict['blocksize'],
-            code=qs_dict['quant_map'].to(device),
-            dtype=getattr(torch, qs_dict['dtype']),
-            shape=torch.Size(qs_dict['shape']) if qs_dict['shape'] is not None else None,
-            offset=offset,
-            state2=state2,
-        )
-        return quant_state
-
-    def as_dict(self, packed=False):
-        """
-        returns dict of tensors and strings to use in serialization via _save_to_state_dict()
-        param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving
-        """
-        qs_dict = {
-            'quant_type': self.quant_type,
-            'absmax': self.absmax,
-            'blocksize': self.blocksize,
-            'quant_map': self.code,
-            'dtype': str(self.dtype).strip('torch.'),
-            'shape': tuple(self.shape),
-        }
-        if self.nested:
-            qs_dict.update({
-                'nested_absmax': self.state2.absmax,
-                'nested_blocksize': self.state2.blocksize,
-                'nested_quant_map': self.state2.code.clone(),  # un-shared to avoid restoring it after shared tensors are removed by safetensors
-                'nested_dtype': str(self.state2.dtype).strip('torch.'),
-                'nested_offset': self.offset.item(),
-            })
-        if not packed:
-            return qs_dict
-
-        # packed format allows serialization of non-tensor components, critical for saving in safetensors format
-        qs_packed_dict = {k: v for k, v in qs_dict.items() if isinstance(v, torch.Tensor)}
-        non_tensor_dict = {k: v for k, v in qs_dict.items() if not isinstance(v, torch.Tensor)}
-        qs_packed_dict["quant_state." + "bitsandbytes__" + self.quant_type] = pack_dict_to_tensor(non_tensor_dict)
-        return qs_packed_dict
-
-    def to(self, device):
-        # make sure the quantization state is on the right device
-        self.absmax = self.absmax.to(device)
-        if self.nested:
-            self.offset = self.offset.to(device)
-            self.state2.absmax = self.state2.absmax.to(device)
-            self.state2.code = self.state2.code.to(device)
-
-
 def quantize_blockwise(
     A: Tensor,
     code: Optional[torch.Tensor] = None,
@@ -918,12 +801,81 @@ def quantize_fp4(A: Tensor, absmax: Optional[torch.Tensor] = None, out: Optional
 def quantize_nf4(A: Tensor, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize=64, compress_statistics=False, quant_storage=torch.uint8):
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4', quant_storage)
 
+
+def quantize_4bit(
+    A: Tensor,
+    absmax: Optional[torch.Tensor] = None,
+    out: Optional[torch.Tensor] = None,
+    blocksize=64,
+    compress_statistics=False,
+    quant_type='fp4',
+    quant_storage=torch.uint8,
+) -> Tuple[Tensor, QuantState]:
+    """
+    Quantize tensor A in blocks of 4-bit values.
+
+    Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input tensor.
+    absmax : torch.Tensor
+        The absmax values.
+    out : torch.Tensor
+        The output tensor.
+    blocksize : int
+        The blocksize used in quantization.
+    quant_type : str
+        The 4-bit quantization data type {fp4, nf4}
+
+    Returns
+    -------
+    torch.Tensor:
+        Tensor with packed 4-bit values.
+    tuple(torch.Tensor, torch.Size, torch.dtype, int):
+        The quantization state to undo the quantization.
+    """
+    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    return backends[A.device.type].quantize_4bit(A, absmax=absmax, out=out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type, quant_storage=quant_storage)
+
 def dequantize_fp4(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64) -> Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
 
 def dequantize_nf4(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64) -> Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')
 
+def dequantize_4bit(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
+    """
+    Dequantizes FP4 blockwise quantized values.
+
+    Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input tensor (packed 4-bit values).
+    quant_state : QuantState
+        object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
+    absmax : torch.Tensor
+        The absmax values.
+    out : torch.Tensor
+        Dequantized output tensor.
+    blocksize : int
+        The blocksize used in quantization.
+    quant_type : str
+        The 4-bit quantization data type {fp4, nf4}
+
+
+    Returns
+    -------
+    torch.Tensor:
+        Dequantized tensor.
+    """
+    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    return backends[A.device.type].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
+
+
 def quantize(
     A: Tensor,
     code: Optional[torch.Tensor] = None,
@@ -1690,6 +1642,25 @@ def batched_igemm(
     return out
 
 
+def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    return backends[A.device.type].igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
+
+
+def mm_dequant(
+    A,
+    quant_state,
+    row_stats,
+    col_stats,
+    out=None,
+    new_row_stats=None,
+    new_col_stats=None,
+    bias=None
+):
+    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    return backends[A.device.type].mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
+
+
 def get_colrow_absmax(
     A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0
 ):
@@ -1823,6 +1794,16 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
     return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values)
 
 
+def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
+    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    return backends[A.device.type].double_quant(A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
+
+
+def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    return backends[A.device.type].transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
+
+
 def spmm_coo(cooA, B, out=None):
     if out is None:
         out = torch.empty(
@@ -2075,54 +2056,12 @@ def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
     return x.to(dtype)
 
 
-def pipeline_test(A, batch_size):
-    out = torch.zeros_like(A)
-    lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
-    return out
-
-# 8 bits common functions
-def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
-    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
-    return Backends.devices[A.device.type].double_quant(A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
-
-def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
-    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
-    return Backends.devices[A.device.type].transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
-
-def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
-    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
-    return Backends.devices[A.device.type].igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
-
-def mm_dequant(
-    A,
-    quant_state,
-    row_stats,
-    col_stats,
-    out=None,
-    new_row_stats=None,
-    new_col_stats=None,
-    bias=None
-):
-    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
-    return Backends.devices[A.device.type].mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
-
 def extract_outliers(A, SA, idx):
-    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
-    return Backends.devices[A.device.type].extract_outliers(A, SA, idx)
+    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    return backends[A.device.type].extract_outliers(A, SA, idx)
 
-# 4 bits common functions
-def quantize_4bit(
-    A: Tensor,
-    absmax: Optional[torch.Tensor] = None,
-    out: Optional[torch.Tensor] = None,
-    blocksize=64,
-    compress_statistics=False,
-    quant_type='fp4',
-    quant_storage=torch.uint8,
-) -> Tuple[Tensor, QuantState]:
-    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
-    return Backends.devices[A.device.type].quantize_4bit(A, absmax=absmax, out=out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type, quant_storage=quant_storage)
 
-def dequantize_4bit(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
-    assert A.device.type in Backends.devices, f"Device backend for {A.device.type} is not supported"
-    return Backends.devices[A.device.type].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
+def pipeline_test(A, batch_size):
+    out = torch.zeros_like(A)
+    lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
+    return out
\ No newline at end of file
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 0582f7fc0..8c42ddfed 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -1,7 +1,7 @@
 import json
 import shlex
 import subprocess
-from typing import Tuple
+from typing import Tuple, Dict, Any
 
 import torch
 
@@ -200,3 +200,121 @@ def unpack_tensor_to_dict(tensor_data):
     unpacked_dict = json.loads(json_str)
 
     return unpacked_dict
+
+class QuantState:
+    """container for quantization state components to work with Params4bit and similar classes"""
+    valid_quant_types = ('fp4', 'nf4')
+    valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
+    valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type',
+                     'blocksize', 'dtype', 'shape', 'nested_blocksize', 'nested_dtype', 'nested_offset']
+
+    def __init__(self, absmax, shape=None, code=None, blocksize=None, quant_type=None, dtype=None, offset=None, state2=None):
+        self.absmax = absmax
+        self.shape = shape
+        self.code = code
+        self.dtype = dtype
+        self.blocksize = blocksize
+        self.quant_type = quant_type
+        self.offset = offset
+        self.state2 = state2
+        self.nested = state2 is not None
+
+    def __get_item__(self, idx):
+        """
+        ensures compatibility with older quant state scheme with nested lists.
+        assumes the following layout:
+        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
+        state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
+        """
+        if self.nested:
+            list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, [self.offset, self.state2], self.quant_type]
+        else:
+            list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, None, self.quant_type]
+        return list_repr[idx]
+
+    @classmethod
+    def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> 'QuantState':
+        """
+        unpacks components of state_dict into QuantState
+        where necessary, convert into strings, torch.dtype, ints, etc.
+
+        qs_dict: based on state_dict, with only relevant keys, striped of prefixes.
+
+        item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.
+        """
+
+        # unpacking tensor with non-tensor components
+        qs_key = [k for k, v in qs_dict.items() if "quant_state" in k and isinstance(v, torch.Tensor)]
+        if not len(qs_key) and 'quant_type' not in qs_dict:
+            raise ValueError("Expected packed or unpacked quant_state items, found neither")
+        elif len(qs_key) != 1 or qs_key[0].split(".")[-1] not in cls.valid_qs_type_keys:
+            raise ValueError(f"There should be exactly one `quant_state` item with ending from {cls.valid_qs_type_keys}.\nDetected {qs_key}.")
+
+        # unpacking minor and non-tensor quant state items if necessary
+        if len(qs_key) == 1:
+            first_qs_key = qs_key[0]
+            qs_dict.update(unpack_tensor_to_dict(qs_dict.pop(first_qs_key)))
+
+        qs_dict = {k.split('.')[-1]: v for k, v in qs_dict.items()}  # strip prefixes
+        assert set(qs_dict.keys()).issubset(cls.valid_qs_keys)
+
+        if 'nested_absmax' in qs_dict:
+            offset = torch.tensor(float(qs_dict['nested_offset'])).to(device)
+            state2 = cls(
+                absmax=qs_dict['nested_absmax'].to(device),
+                blocksize=qs_dict['nested_blocksize'],
+                code=qs_dict['nested_quant_map'].to(device),
+                dtype=getattr(torch, qs_dict['nested_dtype']),
+            )
+        else:
+            offset, state2 = None, None
+
+        quant_state = cls(
+            quant_type=qs_dict['quant_type'],
+            absmax=qs_dict['absmax'].to(device),
+            blocksize=qs_dict['blocksize'],
+            code=qs_dict['quant_map'].to(device),
+            dtype=getattr(torch, qs_dict['dtype']),
+            shape=torch.Size(qs_dict['shape']) if qs_dict['shape'] is not None else None,
+            offset=offset,
+            state2=state2,
+        )
+        return quant_state
+
+    def as_dict(self, packed=False):
+        """
+        returns dict of tensors and strings to use in serialization via _save_to_state_dict()
+        param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving
+        """
+        qs_dict = {
+            'quant_type': self.quant_type,
+            'absmax': self.absmax,
+            'blocksize': self.blocksize,
+            'quant_map': self.code,
+            'dtype': str(self.dtype).strip('torch.'),
+            'shape': tuple(self.shape),
+        }
+        if self.nested:
+            qs_dict.update({
+                'nested_absmax': self.state2.absmax,
+                'nested_blocksize': self.state2.blocksize,
+                'nested_quant_map': self.state2.code.clone(),  # un-shared to avoid restoring it after shared tensors are removed by safetensors
+                'nested_dtype': str(self.state2.dtype).strip('torch.'),
+                'nested_offset': self.offset.item(),
+            })
+        if not packed:
+            return qs_dict
+
+        # packed format allows serialization of non-tensor components, critical for saving in safetensors format
+        qs_packed_dict = {k: v for k, v in qs_dict.items() if isinstance(v, torch.Tensor)}
+        non_tensor_dict = {k: v for k, v in qs_dict.items() if not isinstance(v, torch.Tensor)}
+        qs_packed_dict["quant_state." + "bitsandbytes__" + self.quant_type] = pack_dict_to_tensor(non_tensor_dict)
+        return qs_packed_dict
+
+    def to(self, device):
+        # make sure the quantization state is on the right device
+        self.absmax = self.absmax.to(device)
+        if self.nested:
+            self.offset = self.offset.to(device)
+            self.state2.absmax = self.state2.absmax.to(device)
+            self.state2.code = self.state2.code.to(device)

From 0905ad743f887ef396fedb0b364cfec04d8acd26 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Thu, 8 Feb 2024 07:32:13 -0800
Subject: [PATCH 059/234] fix import issue

---
 bitsandbytes/__init__.py          | 4 +++-
 bitsandbytes/backends/__init__.py | 5 -----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 512fd2455..e7eb6af6f 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -17,7 +17,9 @@
 
 if COMPILED_WITH_CUDA:
     from .optim import adam
-
+    from .backends import register_backend, backends
+    from .backends.cuda import CUDABackend
+    register_backend("cuda", CUDABackend())
 __pdoc__ = {
     "libbitsandbytes": False,
     "optim.optimizer.Optimizer8bit": False,
diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index 0ae01a3d3..015b719cc 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -1,14 +1,9 @@
 from typing import Dict
 import torch
 
-from bitsandbytes.cextension import COMPILED_WITH_CUDA
 from bitsandbytes.backends.base import Backend
 
 backends: Dict[str, Backend] = {}
 
 def register_backend(backend_name: str, backend_instance: Backend):
     backends[backend_name.lower()] = backend_instance
-
-if COMPILED_WITH_CUDA:
-    from .cuda import CUDABackend
-    register_backend("cuda", CUDABackend())

From 145a8357c3063b94a15f02f12d062db6b478de1d Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Thu, 8 Feb 2024 23:33:38 +0800
Subject: [PATCH 060/234] minor clean

---
 bitsandbytes/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index e7eb6af6f..3f0db3536 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -17,7 +17,7 @@
 
 if COMPILED_WITH_CUDA:
     from .optim import adam
-    from .backends import register_backend, backends
+    from .backends import register_backend
     from .backends.cuda import CUDABackend
     register_backend("cuda", CUDABackend())
 __pdoc__ = {

From 7aa42beea0be0ef53a0daf6c6a63751349aeb523 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Mon, 12 Feb 2024 19:24:50 +0000
Subject: [PATCH 061/234] Enable estimate quantile tests

---
 csrc/kernels.hip         | 2 ++
 tests/test_functional.py | 3 ---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 64a93cc6e..c5e11b1a1 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -630,6 +630,8 @@ __global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const f
       for(int j = threadIdx.x; j < BLOCK_ESTIMATE; j+=blockDim.x)
           temp_storage.smem_qidx[j] = -1;
 
+      __syncthreads();
+
       if(threadIdx.x < 256)
       {
           float q_interval = (1.0f-(2.0f*offset))/255.0f;
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 565a45f3f..8d04a650f 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -91,7 +91,6 @@ def setup():
 def teardown():
     pass
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize(
     "dtype", [torch.float32, torch.float16], ids=["float", "half"]
 )
@@ -111,7 +110,6 @@ def test_estimate_quantiles(dtype):
     diff = torch.abs(code - quantiles)
     assert (diff > 5e-02).sum().item() == 0
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_quantile_quantization():
     for i in range(100):
         A1 = torch.randn(1024, 1024, device="cuda")
@@ -2207,7 +2205,6 @@ def test_few_bit_quant():
     #assert False
 
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_kbit_quantile_estimation():
     for i in range(100):
         data = torch.randn(1024, 1024, device='cuda')

From d270832cb16c8d83d4a312bc569f8ae03b6cb2b3 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Mon, 12 Feb 2024 19:33:30 -0800
Subject: [PATCH 062/234] fix CI python format

---
 bitsandbytes/__init__.py          |  2 +-
 bitsandbytes/backends/__init__.py |  1 +
 bitsandbytes/backends/cuda.py     |  1 -
 bitsandbytes/functional.py        | 10 ++++------
 bitsandbytes/utils.py             |  2 +-
 5 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 3f0db3536..c42b4a274 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -16,9 +16,9 @@
 from .nn import modules
 
 if COMPILED_WITH_CUDA:
-    from .optim import adam
     from .backends import register_backend
     from .backends.cuda import CUDABackend
+    from .optim import adam
     register_backend("cuda", CUDABackend())
 __pdoc__ = {
     "libbitsandbytes": False,
diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index 015b719cc..3a33d24ca 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -1,4 +1,5 @@
 from typing import Dict
+
 import torch
 
 from bitsandbytes.backends.base import Backend
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 248d1e4c1..6ba02d009 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -19,7 +19,6 @@
     prod,
     quantize_blockwise,
 )
-
 from bitsandbytes.utils import QuantState
 
 from .base import Backend
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index b75eac67e..9dbd5c1f0 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -6,19 +6,17 @@
 from functools import reduce  # Required in Python 3
 import itertools
 import operator
-from typing import Any, Dict, Optional, Tuple
+from typing import Optional, Tuple
 
 import numpy as np
 import torch
 from torch import Tensor
 
-from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
+from bitsandbytes.backends import backends
+from bitsandbytes.utils import QuantState
 
 from .cextension import COMPILED_WITH_CUDA, lib
 
-from bitsandbytes.utils import QuantState
-
-from bitsandbytes.backends import backends
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):
@@ -2064,4 +2062,4 @@ def extract_outliers(A, SA, idx):
 def pipeline_test(A, batch_size):
     out = torch.zeros_like(A)
     lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
-    return out
\ No newline at end of file
+    return out
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 8c42ddfed..032bb31e5 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -1,7 +1,7 @@
 import json
 import shlex
 import subprocess
-from typing import Tuple, Dict, Any
+from typing import Any, Dict, Tuple
 
 import torch
 

From 68e785908adaaf3c1b0d06fbf0fba6ce7445df12 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Thu, 15 Feb 2024 21:04:40 +0000
Subject: [PATCH 063/234] fix py38 vers incompatibility from other PR

---
 tests/helpers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/helpers.py b/tests/helpers.py
index 46c6ef93d..f82a8631f 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -1,13 +1,13 @@
 from itertools import product
 import random
-from typing import Any
+from typing import Any, List
 
 import torch
 
 test_dims_rng = random.Random(42)
 
 
-def get_test_dims(min: int, max: int, *, n: int) -> list[int]:
+def get_test_dims(min: int, max: int, *, n: int) -> List[int]:
     return [test_dims_rng.randint(min, max) for _ in range(n)]
 
 

From 012b565dea120fe40a353493881058f6ea0a48b5 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 Feb 2024 22:23:06 +0000
Subject: [PATCH 064/234] update pre-commit

---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index edcbc9b6b..4fb5cf528 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.2.0
+    rev: v0.2.1
     hooks:
       - id: ruff
         args:
@@ -18,6 +18,6 @@ repos:
         args:
           - --fix=lf
   - repo: https://github.com/crate-ci/typos
-    rev: v1.17.2
+    rev: typos-v0.10.21
     hooks:
       - id: typos

From 8fa27f60b2f7c9fd5398a6eff8da0eafe9ed8f1e Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 Feb 2024 22:23:59 +0000
Subject: [PATCH 065/234] cuda.py: harmonize whitespace

---
 bitsandbytes/backends/cuda.py | 40 ++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 6ba02d009..4b9ae4b87 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -122,10 +122,15 @@ def double_quant(
 
     def transform(self, A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
         prev_device = pre_call(A.device)
-        if state is None: state = (A.shape, from_order)
-        else: from_order = state[1]
-        if out is None: out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
-        else: new_state = (state[0], to_order) # (shape, order)
+        if state is None:
+            state = (A.shape, from_order)
+        else:
+            from_order = state[1]
+
+        if out is None:
+            out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
+        else:
+            new_state = (state[0], to_order) # (shape, order)
 
         shape = state[0]
         if len(shape) == 2:
@@ -141,21 +146,25 @@ def transform(self, A, to_order, from_order='row', out=None, transpose=False, st
                 lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
             else:
                 lib.ctransform_row2col32(get_ptr(A), get_ptr(out), dim1, dim2)
+
         elif to_order == "col_turing":
             if transpose:
                 lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2)
             else:
                 lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2)
+    
         elif to_order == "col_ampere":
             if transpose:
                 lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2)
             else:
                 lib.ctransform_row2ampere(get_ptr(A), get_ptr(out), dim1, dim2)
+
         elif to_order == "row":
             if from_order == "col_turing":
                 lib.ctransform_turing2row(get_ptr(A), get_ptr(out), dim1, dim2)
             elif from_order == "col_ampere":
                 lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
+
         else:
             raise NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}')
 
@@ -168,6 +177,7 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         shapeB = SB[0]
         dimsA = len(shapeA)
         dimsB = len(shapeB)
+
         assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
         if dimsA == 2:
             m = shapeA[0]
@@ -204,6 +214,7 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         assert (
             shapeA[-1] == shapeB[-1]
         ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
+
         formatB = SB[1]
         prev_device = A.device
         torch.cuda.set_device(A.device)
@@ -232,6 +243,7 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         has_error = 0
         ptrRowScale = get_ptr(None)
         is_on_gpu([A, B, out])
+
         if formatB == 'col_turing':
             if dtype == torch.int32:
                 has_error = lib.cigemmlt_turing_32(
@@ -241,6 +253,7 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
                 has_error = lib.cigemmlt_turing_8(
                     ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
                 )
+
         elif formatB == "col_ampere":
             if dtype == torch.int32:
                 has_error = lib.cigemmlt_ampere_32(
@@ -331,10 +344,12 @@ def extract_outliers(self, A, SA, idx):
         ptrOut = get_ptr(out)
 
         prev_device = pre_call(A.device)
+
         if formatA == 'col_turing':
             lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
         elif formatA == "col_ampere":
             lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
+    
         post_call(prev_device)
 
         return out
@@ -362,7 +377,6 @@ def quantize_4bit(
             blocks += 1 if n % blocksize > 0 else 0
             absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
 
-
         if out is None:
             mod = dtype2bytes[quant_storage] * 2
             out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
@@ -377,18 +391,22 @@ def quantize_4bit(
                 lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
             else:
                 lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+
         elif A.dtype == torch.float16:
             if quant_type == 'fp4':
                 lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
             else:
                 lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+
         elif A.dtype == torch.bfloat16:
             if quant_type == 'fp4':
                 lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
             else:
                 lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+    
         else:
             raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+
         post_call(A.device)
 
         code = get_4bit_type(quant_type, device=A.device)
@@ -399,14 +417,16 @@ def quantize_4bit(
             qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
             del absmax
             state = QuantState(absmax=qabsmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, offset=offset, state2=state2)
+
         else:
-            state = QuantState(absmax=absmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, )
+            state = QuantState(absmax=absmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type)
 
         return out, state
 
     def dequantize_4bit(self, A: torch.Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, quant_type='fp4') -> torch.Tensor:
         if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
             raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
+
         if quant_type not in ['fp4', 'nf4']:
             raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
 
@@ -414,11 +434,9 @@ def dequantize_4bit(self, A: torch.Tensor, quant_state: Optional[QuantState] = N
             assert absmax is not None and out is not None
 
             quant_state = QuantState(absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type)
-
         else:
             absmax = quant_state.absmax
 
-
         if quant_state.nested:
             absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
             absmax += quant_state.offset
@@ -431,25 +449,31 @@ def dequantize_4bit(self, A: torch.Tensor, quant_state: Optional[QuantState] = N
 
         device = pre_call(A.device)
         is_on_gpu([A, absmax, out])
+
         if out.dtype == torch.float32:
             if quant_state.quant_type == 'fp4':
                 lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
             else:
                 lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+
         elif out.dtype == torch.float16:
             if quant_state.quant_type == 'fp4':
                 lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
             else:
                 lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+
         elif out.dtype == torch.bfloat16:
             if quant_state.quant_type == 'fp4':
                 lib.cdequantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
             else:
                 lib.cdequantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+
         else:
             raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+
         post_call(A.device)
 
         is_transposed = (True if A.shape[0] == 1 else False)
+        
         if is_transposed: return out.t()
         else: return out

From 2c04d4821a90f8a0bd2b1bfff0cd73e83006d7d8 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 Feb 2024 22:25:02 +0000
Subject: [PATCH 066/234] delete dead code

---
 bitsandbytes/cextension.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index dab34982e..db9c05779 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -32,8 +32,3 @@
         "8-bit optimizers, 8-bit multiplication, and CUDA GPU quantization are unavailable.")
     COMPILED_WITH_CUDA = False
     print(str(ex))
-
-
-# print the setup details after checking for errors so we do not print twice
-#if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
-    #setup.print_log_stack()

From c1846557a0388c553d13a0372a2be3e0d9720acb Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 Feb 2024 22:29:14 +0000
Subject: [PATCH 067/234] fix whitespace

---
 bitsandbytes/backends/cuda.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 4b9ae4b87..4fa1946e9 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -152,7 +152,7 @@ def transform(self, A, to_order, from_order='row', out=None, transpose=False, st
                 lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2)
             else:
                 lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2)
-    
+
         elif to_order == "col_ampere":
             if transpose:
                 lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2)
@@ -349,7 +349,7 @@ def extract_outliers(self, A, SA, idx):
             lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
         elif formatA == "col_ampere":
             lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
-    
+
         post_call(prev_device)
 
         return out
@@ -403,7 +403,7 @@ def quantize_4bit(
                 lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
             else:
                 lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
-    
+
         else:
             raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
 
@@ -474,6 +474,6 @@ def dequantize_4bit(self, A: torch.Tensor, quant_state: Optional[QuantState] = N
         post_call(A.device)
 
         is_transposed = (True if A.shape[0] == 1 else False)
-        
+
         if is_transposed: return out.t()
         else: return out

From 03b53d7eb4558f80507155b8da148c177774d483 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 Feb 2024 22:29:24 +0000
Subject: [PATCH 068/234] fix typo

---
 csrc/kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index df8488389..65aa14896 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -3073,7 +3073,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 //// 4. do dequantization from register of B into second pair of registers
 //// 5. store (4) into fragment
 //// 6. matmul aggregate into fragment C
-//// 7. aggreecate files of C into shared memory block C
+//// 7. aggregate files of C into shared memory block C
 //// 8. sum (7)
 //// 9. write outputs to matmul output matrix
 //}

From ba7a1620bef5231c8817218c0b24b580e3e80f25 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 Feb 2024 22:31:59 +0000
Subject: [PATCH 069/234] remove exstraneous import

---
 bitsandbytes/backends/__init__.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index 3a33d24ca..5fb2fc130 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -1,7 +1,5 @@
 from typing import Dict
 
-import torch
-
 from bitsandbytes.backends.base import Backend
 
 backends: Dict[str, Backend] = {}

From d162998ee5b7875e8b8bbd75780106757a467fc2 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Sat, 17 Feb 2024 00:28:11 +0000
Subject: [PATCH 070/234] factor out ensure_backend_is_available, exc instead
 of assert

---
 bitsandbytes/backends/__init__.py |  5 +++++
 bitsandbytes/functional.py        | 16 ++++++++--------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index 5fb2fc130..d35021b1e 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -6,3 +6,8 @@
 
 def register_backend(backend_name: str, backend_instance: Backend):
     backends[backend_name.lower()] = backend_instance
+
+def ensure_backend_is_available(device_type: str):
+    """Check if a backend is available for the given device type."""
+    if device_type.lower() not in backends:
+        raise NotImplementedError(f"Device backend for {device_type} is currently not supported.")
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 9dbd5c1f0..e94265e53 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -12,7 +12,7 @@
 import torch
 from torch import Tensor
 
-from bitsandbytes.backends import backends
+from bitsandbytes.backends import backends, ensure_backend_is_available
 from bitsandbytes.utils import QuantState
 
 from .cextension import COMPILED_WITH_CUDA, lib
@@ -834,7 +834,7 @@ def quantize_4bit(
     tuple(torch.Tensor, torch.Size, torch.dtype, int):
         The quantization state to undo the quantization.
     """
-    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    ensure_backend_is_available(A.device.type)
     return backends[A.device.type].quantize_4bit(A, absmax=absmax, out=out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type, quant_storage=quant_storage)
 
 def dequantize_fp4(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64) -> Tensor:
@@ -870,7 +870,7 @@ def dequantize_4bit(A: Tensor, quant_state: Optional[QuantState] = None, absmax:
     torch.Tensor:
         Dequantized tensor.
     """
-    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    ensure_backend_is_available(A.device.type)
     return backends[A.device.type].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
 
 
@@ -1641,7 +1641,7 @@ def batched_igemm(
 
 
 def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
-    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    ensure_backend_is_available(A.device.type)
     return backends[A.device.type].igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
 
 
@@ -1655,7 +1655,7 @@ def mm_dequant(
     new_col_stats=None,
     bias=None
 ):
-    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    ensure_backend_is_available(A.device.type)
     return backends[A.device.type].mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
 
 
@@ -1793,12 +1793,12 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
 
 
 def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
-    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    ensure_backend_is_available(A.device.type)
     return backends[A.device.type].double_quant(A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
 
 
 def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
-    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    ensure_backend_is_available(A.device.type)
     return backends[A.device.type].transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
 
 
@@ -2055,7 +2055,7 @@ def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
 
 
 def extract_outliers(A, SA, idx):
-    assert A.device.type in backends, f"Device backend for {A.device.type} is not supported"
+    ensure_backend_is_available(A.device.type)
     return backends[A.device.type].extract_outliers(A, SA, idx)
 
 

From fad79188b0a6a8c1adf9d5e168073d8a70d0a807 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 20 Feb 2024 04:36:08 +0000
Subject: [PATCH 071/234] Enable transpose flag for row to col transform

---
 bitsandbytes/functional.py |  8 ++------
 csrc/ops.hip               |  3 +++
 csrc/pythonInterface.c     |  6 ++++++
 tests/test_functional.py   | 21 +++++++++------------
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 6fd2570b8..e62d0c49d 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -461,11 +461,7 @@ def get_transform_buffer(
         state = (shape[::-1], to_order)
 
     if to_order == "row" or to_order == "col":
-        if HIP_ENVIRONMENT and to_order == "col":
-            # row to col transformation transposes output shape, so change buffer allocation accordingly
-            return init_func(shape[::-1], dtype=dtype, device=device), state
-        else:
-            return init_func(shape, dtype=dtype, device=device), state
+        return init_func(shape, dtype=dtype, device=device), state
     elif to_order == "col32":
         # blocks of 32 columns (padded)
         cols = 32 * ((cols + 31) // 32)
@@ -503,7 +499,7 @@ def nvidia_transform(
         from_order = state[1]
     if out is None:
         out, new_state = get_transform_buffer(
-            state[0], A.dtype, A.device, to_order, state[1]
+            state[0], A.dtype, A.device, to_order, state[1], transpose
         )
     else:
         new_state = (state[1], to_order)
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 54743d111..fa66f7fc4 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -424,6 +424,9 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
 }
 
 template void transform<int8_t, ROW, COL, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int8_t, ROW, COL, true, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
+template void transform<int32_t, ROW, COL, false, 32>(hipblasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
+template void transform<int32_t, ROW, COL, true, 32>(hipblasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
 template void transform<int8_t, ROW, ROW, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
 template void transform<int8_t, ROW, COL32, false, 8>(hipblasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
 template void transform<int32_t, ROW, COL32, false, 32>(hipblasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index ba551dcc3..b7fdf113e 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -158,6 +158,9 @@ void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(hipblasLtHandle_t lt
 #endif
 
 MAKE_FUNC_TRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8);
+MAKE_FUNC_TRANSFORM(8, row, col, t, int8_t, ROW, COL, true, 8);
+MAKE_FUNC_TRANSFORM(32, row, col, n, int32_t, ROW, COL, false, 32);
+MAKE_FUNC_TRANSFORM(32, row, col, t, int32_t, ROW, COL, true, 32);
 MAKE_FUNC_TRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8);
 MAKE_FUNC_TRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8);
 MAKE_FUNC_TRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32);
@@ -406,6 +409,9 @@ extern "C"
 
 
 	MAKE_FUNC_CTRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8)
+	MAKE_FUNC_CTRANSFORM(8, row, col, t, int8_t, ROW, COL, true, 8)
+	MAKE_FUNC_CTRANSFORM(32, row, col, n, int32_t, ROW, COL, false, 32)
+	MAKE_FUNC_CTRANSFORM(32, row, col, t, int32_t, ROW, COL, true, 32)
 	MAKE_FUNC_CTRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8)
 	MAKE_FUNC_CTRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 8d04a650f..9e4ab9fd6 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -719,19 +719,16 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
         C3, S = F.nvidia_transform(C2, "row", state=SC)
         torch.testing.assert_close(C1, C3.float())
 
-        # Since ROCm supports row to col transformation only which is same as transpose,
-        # skipping this for HIP environment
-        if not HIP_ENVIRONMENT:
-            ## transpose
-            B = torch.randint(-128, 127, size=(dim3, dim4), device="cuda").to(
-                torch.int8
-            )
-            C1 = torch.matmul(A.float(), B.float())
+        ## transpose
+        B = torch.randint(-128, 127, size=(dim3, dim4), device="cuda").to(
+            torch.int8
+        )
+        C1 = torch.matmul(A.float(), B.float())
 
-            B2t, SBt = F.transform(B, "col_turing", transpose=True)
-            C2, SC = F.igemmlt(A2, B2t, SA, SBt)
-            C3, S = F.nvidia_transform(C2, "row", state=SC)
-            torch.testing.assert_close(C1, C3.float())
+        B2t, SBt = F.nvidia_transform(B, "col_turing", transpose=True)
+        C2, SC = F.igemmlt(A2, B2t, SA, SBt)
+        C3, S = F.nvidia_transform(C2, "row", state=SC)
+        torch.testing.assert_close(C1, C3.float())
 
 
 dim1 = [32]

From e3021ee0f610c6ae16ae99a845a8305c230ab8ca Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 20 Feb 2024 04:36:36 +0000
Subject: [PATCH 072/234] Update descriptors for transpose flag

---
 csrc/ops.hip | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index fa66f7fc4..cfb268dec 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -383,7 +383,12 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
   hipblasLtOrder_t orderA = get_order<SRC>();
   hipblasLtOrder_t orderOut = get_order<TARGET>();
   int ldA = get_leading_dim<SRC>(dim1, dim2);
-  int ldOut = get_leading_dim<TARGET>(dim1, dim2);
+  int ldOut;
+  if (TARGET==COL && transpose) {
+    ldOut = dim2;
+  } else {
+    ldOut = get_leading_dim<TARGET>(dim1, dim2);
+  }
 
   hipblasLtMatrixLayout_t A_desc = NULL, out_desc = NULL, B_desc = NULL;
   T B = T(0);
@@ -395,13 +400,21 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
   {
     checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIP_R_8I, dim1, dim2, ldA));
     checkHipblasStatus(hipblasLtMatrixLayoutCreate(&B_desc, HIP_R_8I, 0, 0, 0));
-    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIP_R_8I, dim1, dim2, ldOut));
+    if (TARGET==COL && transpose) {
+	checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIP_R_8I, dim2, dim1, ldOut));
+    } else {
+	checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIP_R_8I, dim1, dim2, ldOut));
+    }
   }
   else if(DTYPE == 32)
   {
     checkHipblasStatus(hipblasLtMatrixLayoutCreate(&A_desc, HIP_R_32I, dim1, dim2, ldA));
     checkHipblasStatus(hipblasLtMatrixLayoutCreate(&B_desc, HIP_R_32I, 0, 0, 0));
-    checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIP_R_32I, dim1, dim2, ldOut));
+    if (TARGET==COL && transpose) {
+	checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIP_R_32I, dim2, dim1, ldOut));
+    } else {
+        checkHipblasStatus(hipblasLtMatrixLayoutCreate(&out_desc, HIP_R_32I, dim1, dim2, ldOut));
+    }
   }
   else
   {

From 8c3476f27ec8a72b380c8bfd7ea13d10f81434ac Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 20 Feb 2024 04:41:55 +0000
Subject: [PATCH 073/234] revert nvidia_transform to transform

---
 tests/test_functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 9e4ab9fd6..f914820fe 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -725,7 +725,7 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
         )
         C1 = torch.matmul(A.float(), B.float())
 
-        B2t, SBt = F.nvidia_transform(B, "col_turing", transpose=True)
+        B2t, SBt = F.transform(B, "col_turing", transpose=True)
         C2, SC = F.igemmlt(A2, B2t, SA, SBt)
         C3, S = F.nvidia_transform(C2, "row", state=SC)
         torch.testing.assert_close(C1, C3.float())

From 5e1b152d2a907bfe8ef1aa01afe1fb82bb8fa10a Mon Sep 17 00:00:00 2001
From: "U-AMD\\zhaoyili" <zhaoyili@AUS-L1-ZHAOYILI.amd.com>
Date: Tue, 20 Feb 2024 11:58:44 -0600
Subject: [PATCH 074/234] update changes

---
 bitsandbytes/functional.py | 10 +++++-----
 csrc/kernels.hip           | 13 ++++++-------
 csrc/ops.hip               |  8 ++++----
 csrc/pythonInterface.c     | 16 ++++++++--------
 tests/test_optim.py        |  4 ++--
 5 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 6fd2570b8..f8de6542d 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -30,7 +30,7 @@ def prod(iterable):
 if COMPILED_WITH_CUDA:
     """C FUNCTIONS FOR OPTIMIZERS"""
     str2optimizer32bit = {}
-    str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16) #, lib.cadam32bit_grad_bf16)
+    str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16, lib.cadam32bit_grad_bf16)
     str2optimizer32bit["momentum"] = (
         lib.cmomentum32bit_grad_32,
         lib.cmomentum32bit_grad_16,
@@ -39,7 +39,7 @@ def prod(iterable):
         lib.crmsprop32bit_grad_32,
         lib.crmsprop32bit_grad_16,
     )
-    str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16) #, lib.clion32bit_grad_bf16)
+    str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16, lib.clion32bit_grad_bf16)
     str2optimizer32bit["adagrad"] = (
         lib.cadagrad32bit_grad_32,
         lib.cadagrad32bit_grad_16,
@@ -75,7 +75,7 @@ def prod(iterable):
     str2optimizer8bit_blockwise["adam"] = (
         lib.cadam_8bit_blockwise_grad_fp32,
         lib.cadam_8bit_blockwise_grad_fp16,
-        #lib.cadam_8bit_blockwise_grad_bf16,
+        lib.cadam_8bit_blockwise_grad_bf16,
     )
     str2optimizer8bit_blockwise["momentum"] = (
         lib.cmomentum_8bit_blockwise_grad_fp32,
@@ -88,7 +88,7 @@ def prod(iterable):
     str2optimizer8bit_blockwise["lion"] = (
         lib.clion_8bit_blockwise_grad_fp32,
         lib.clion_8bit_blockwise_grad_fp16,
-        #lib.clion_8bit_blockwise_grad_bf16,
+        lib.clion_8bit_blockwise_grad_bf16,
     )
     str2optimizer8bit_blockwise["adagrad"] = (
         lib.cadagrad_8bit_blockwise_grad_fp32,
@@ -1643,7 +1643,7 @@ def gemv_4bit(
 
     if B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32]:
         if A.dtype == torch.float16:
-            lib.cgemm_4bit_inference_naive_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize))
+            lib.cgemm_4bit_inference(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize))
         elif A.dtype == torch.bfloat16:
             lib.cgemm_4bit_inference_naive_bf16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize))
         elif A.dtype == torch.float32:
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index edcde6306..458f7f1c0 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -3836,7 +3836,7 @@ MAKE_PreconditionOptimizer32bit1State(RMSPROP, half)
 MAKE_PreconditionOptimizer32bit1State(RMSPROP, float)
 MAKE_PreconditionOptimizer32bit1State(LION, half)
 MAKE_PreconditionOptimizer32bit1State(LION, float)
-//MAKE_PreconditionOptimizer32bit1State(LION, hip_bfloat16)
+MAKE_PreconditionOptimizer32bit1State(LION, hip_bfloat16)
 MAKE_PreconditionOptimizer32bit1State(ADAGRAD, half)
 MAKE_PreconditionOptimizer32bit1State(ADAGRAD, float)
 
@@ -3850,7 +3850,7 @@ MAKE_Optimizer32bit1State(RMSPROP, half)
 MAKE_Optimizer32bit1State(RMSPROP, float)
 MAKE_Optimizer32bit1State(LION, half)
 MAKE_Optimizer32bit1State(LION, float)
-//MAKE_Optimizer32bit1State(LION, hip_bfloat16)
+MAKE_Optimizer32bit1State(LION, hip_bfloat16)
 MAKE_Optimizer32bit1State(ADAGRAD, half)
 MAKE_Optimizer32bit1State(ADAGRAD, float)
 
@@ -3862,16 +3862,15 @@ template __global__ void kPreconditionOptimizer32bit2State<gtype, oname, 4096, 8
 
 MAKE_PreconditionOptimizer32bit2State(ADAM, float)
 MAKE_PreconditionOptimizer32bit2State(ADAM, half)
-//MAKE_PreconditionOptimizer32bit2State(ADAM, hip_bfloat16)
+MAKE_PreconditionOptimizer32bit2State(ADAM, hip_bfloat16)
 
 template __global__ void kOptimizer32bit2State<float, ADAM>(float* g, float* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
     const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
 template __global__ void kOptimizer32bit2State<half, ADAM>(half* g, half* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
     const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
-/*
 template __global__ void kOptimizer32bit2State<hip_bfloat16, ADAM>(hip_bfloat16* g, hip_bfloat16* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
     const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
-*/
+
 
 #define MAKE_PreconditionStatic8bit1State(oname, gtype) \
 template __global__ void kPreconditionOptimizerStatic8bit1State<gtype, oname>(gtype* p, gtype* __restrict__ const g, unsigned char*__restrict__  const state1,  \
@@ -4040,7 +4039,7 @@ template __global__ void kOptimizerStatic8bit2StateBlockwise<gtype, oname, block
 
 MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, float, 2048, 8)
 MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, half, 2048, 8)
-//MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, hip_bfloat16, 2048, 8)
+MAKE_OptimizerStatic8bit2StateBlockwise(ADAM, hip_bfloat16, 2048, 8)
 
 
 #define MAKE_OptimizerStatic8bit1StateBlockwise(oname, gtype, block_size, num_per_thread) \
@@ -4059,6 +4058,6 @@ MAKE_OptimizerStatic8bit1StateBlockwise(RMSPROP, float, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(RMSPROP, half, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(LION, float, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(LION, half, 2048, 8)
-//MAKE_OptimizerStatic8bit1StateBlockwise(LION, hip_bfloat16, 2048, 8)
+MAKE_OptimizerStatic8bit1StateBlockwise(LION, hip_bfloat16, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(ADAGRAD, float, 2048, 8)
 MAKE_OptimizerStatic8bit1StateBlockwise(ADAGRAD, half, 2048, 8)
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 54743d111..64b7bf908 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -966,14 +966,14 @@ template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
 
 MAKE_optimizer32bit(ADAM, half)
 MAKE_optimizer32bit(ADAM, float)
-//MAKE_optimizer32bit(ADAM, hip_bfloat16)
+MAKE_optimizer32bit(ADAM, hip_bfloat16)
 MAKE_optimizer32bit(MOMENTUM, half)
 MAKE_optimizer32bit(MOMENTUM, float)
 MAKE_optimizer32bit(RMSPROP, half)
 MAKE_optimizer32bit(RMSPROP, float)
 MAKE_optimizer32bit(LION, half)
 MAKE_optimizer32bit(LION, float)
-//MAKE_optimizer32bit(LION, hip_bfloat16)
+MAKE_optimizer32bit(LION, hip_bfloat16)
 MAKE_optimizer32bit(ADAGRAD, half)
 MAKE_optimizer32bit(ADAGRAD, float)
 
@@ -1009,11 +1009,11 @@ MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
 MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
 MAKE_optimizerStatic8bitBlockwise(half, LION);
 MAKE_optimizerStatic8bitBlockwise(float, LION);
-//MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, LION);
+MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, LION);
 MAKE_optimizerStatic8bitBlockwise(half, ADAGRAD);
 MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD);
 
 template void percentileClipping(float * g, float *gnorm_vec, int step, const int n);
 template void percentileClipping(half * g, float *gnorm_vec, int step, const int n);
 
-//MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAM);
+MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAM);
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index ba551dcc3..c06da38b6 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -60,12 +60,12 @@ MAKE_FUNC32(momentum, MOMENTUM, float, 32)
 MAKE_FUNC32(momentum, MOMENTUM, half, 16)
 MAKE_FUNC32(adam, ADAM, float, fp32)
 MAKE_FUNC32(adam, ADAM, half, fp16)
-//MAKE_FUNC32(adam, ADAM, hip_bfloat16, bf16)
+MAKE_FUNC32(adam, ADAM, hip_bfloat16, bf16)
 MAKE_FUNC32(rmsprop, RMSPROP, float, 32)
 MAKE_FUNC32(rmsprop, RMSPROP, half, 16)
 MAKE_FUNC32(lion, LION, float, fp32)
 MAKE_FUNC32(lion, LION, half, fp16)
-//MAKE_FUNC32(lion, LION, hip_bfloat16, bf16)
+MAKE_FUNC32(lion, LION, hip_bfloat16, bf16)
 MAKE_FUNC32(adagrad, ADAGRAD, float, 32)
 MAKE_FUNC32(adagrad, ADAGRAD, half, 16)
 
@@ -105,10 +105,10 @@ MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, fp16)
 MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, fp32)
 MAKE_BLOCKWISE8(adagrad, ADAGRAD, half, fp16)
 MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, fp32)
-//MAKE_BLOCKWISE8(adam, ADAM, hip_bfloat16, bf16)
+MAKE_BLOCKWISE8(adam, ADAM, hip_bfloat16, bf16)
 MAKE_BLOCKWISE8(lion, LION, half, fp16)
 MAKE_BLOCKWISE8(lion, LION, float, fp32)
-//MAKE_BLOCKWISE8(lion, LION, hip_bfloat16, bf16)
+MAKE_BLOCKWISE8(lion, LION, hip_bfloat16, bf16)
 
 
 void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); }
@@ -272,14 +272,14 @@ extern "C"
 
 	MAKE_CFUNC32(adam, float, fp32)
 	MAKE_CFUNC32(adam, half, fp16)
-	//MAKE_CFUNC32(adam, hip_bfloat16, bf16)
+	MAKE_CFUNC32(adam, hip_bfloat16, bf16)
 	MAKE_CFUNC32(momentum, float, 32)
 	MAKE_CFUNC32(momentum, half, 16)
 	MAKE_CFUNC32(rmsprop, float, 32)
 	MAKE_CFUNC32(rmsprop, half, 16)
 	MAKE_CFUNC32(lion, float, fp32)
 	MAKE_CFUNC32(lion, half, fp16)
-	//MAKE_CFUNC32(lion, hip_bfloat16, bf16)
+	MAKE_CFUNC32(lion, hip_bfloat16, bf16)
 	MAKE_CFUNC32(adagrad, float, 32)
 	MAKE_CFUNC32(adagrad, half, 16)
 
@@ -319,10 +319,10 @@ extern "C"
 	MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, fp32)
 	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, half, fp16)
 	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, float, fp32)
-	//MAKE_CBLOCKWISE8(adam, ADAM, hip_bfloat16, bf16)
+	MAKE_CBLOCKWISE8(adam, ADAM, hip_bfloat16, bf16)
 	MAKE_CBLOCKWISE8(lion, LION, half, fp16)
 	MAKE_CBLOCKWISE8(lion, LION, float, fp32)
-	//MAKE_CBLOCKWISE8(lion, LION, hip_bfloat16, bf16)
+	MAKE_CBLOCKWISE8(lion, LION, hip_bfloat16, bf16)
 
 	void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); }
 	void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); }
diff --git a/tests/test_optim.py b/tests/test_optim.py
index 2724436e5..73f229ef1 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -110,7 +110,7 @@ def rm_path(path):
 optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion']
 values = list(product(dim1, dim2, gtype, optimizer_names))
 names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values]
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
+#@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
 def test_optimizer32bit(dim1, dim2, gtype, optim_name):
     if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: pytest.skip()
@@ -253,7 +253,7 @@ def test_global_config(dim1, dim2, gtype):
 ]
 
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
+#@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
 def test_optimizer8bit(dim1, dim2, gtype, optim_name):
     if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip()

From 2cd9718cdff2fc5da4a19ff7c912426b93f8f094 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Wed, 21 Feb 2024 16:56:38 +0800
Subject: [PATCH 075/234] Remove minor device filter to avoid confusion

---
 bitsandbytes/autograd/_functions.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index edf330f14..6cbb6efd9 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -230,10 +230,6 @@ def supports_igemmlt(device: torch.device) -> bool:
     nvidia16_models = ('GTX 1630', 'GTX 1650', 'GTX 1660')  # https://en.wikipedia.org/wiki/GeForce_16_series
     if any(model_name in device_name for model_name in nvidia16_models):
         return False  # these devices are technically cuda 7.5-capable, but they lack tensor cores
-    if device.type == "cpu":
-        #TODO: will return True once CPU backend upstream the supports
-        return False
-
     return True
 
 
@@ -568,7 +564,7 @@ def matmul(
 
 def matmul_4bit(A: torch.Tensor, B: torch.Tensor, quant_state: F.QuantState, out: Optional[torch.Tensor] = None, bias=None):
     assert quant_state is not None
-    if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type == "cuda":
+    if A.numel() == A.shape[-1] and A.requires_grad == False:
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
             return MatMul4Bit.apply(A, B, out, bias, quant_state)

From 389bb7d086a32d220decd0226f4a1344aee21698 Mon Sep 17 00:00:00 2001
From: "U-AMD\\zhaoyili" <zhaoyili@AUS-L1-ZHAOYILI.amd.com>
Date: Fri, 23 Feb 2024 16:54:14 -0600
Subject: [PATCH 076/234] fixed minor mistakes

---
 bitsandbytes/functional.py | 2 +-
 tests/test_optim.py        | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index f8de6542d..ec25be2a5 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1643,7 +1643,7 @@ def gemv_4bit(
 
     if B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32]:
         if A.dtype == torch.float16:
-            lib.cgemm_4bit_inference(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize))
+            lib.cgemm_4bit_inference_naive_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize))
         elif A.dtype == torch.bfloat16:
             lib.cgemm_4bit_inference_naive_bf16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize))
         elif A.dtype == torch.float32:
diff --git a/tests/test_optim.py b/tests/test_optim.py
index 73f229ef1..c373a4f14 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -110,7 +110,6 @@ def rm_path(path):
 optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion']
 values = list(product(dim1, dim2, gtype, optimizer_names))
 names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values]
-#@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
 def test_optimizer32bit(dim1, dim2, gtype, optim_name):
     if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: pytest.skip()
@@ -253,7 +252,6 @@ def test_global_config(dim1, dim2, gtype):
 ]
 
 
-#@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names)
 def test_optimizer8bit(dim1, dim2, gtype, optim_name):
     if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip()

From fa2882814794cb3d71a518fcb137601f6188ba32 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Wed, 6 Mar 2024 19:14:12 +0000
Subject: [PATCH 077/234] remove blocksize 64 on rocm

---
 bitsandbytes/functional.py | 52 +++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 2146176ed..281d216a0 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -740,7 +740,10 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou
         out = torch.zeros_like(A, dtype=torch.uint8)
 
     if A.device.type != 'cpu':
-        assert blocksize in [4096, 2048, 1024, 512, 256, 128]
+        if not HIP_ENVIRONMENT:
+            assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        else:
+            assert blocksize in [4096, 2048, 1024, 512, 256, 128]
         cblocksize = ct.c_int32(blocksize)
         prev_device = pre_call(A.device)
         code = code.to(A.device)
@@ -825,8 +828,11 @@ def dequantize_blockwise(
     if A.device.type != 'cpu':
         device = pre_call(A.device)
         code = quant_state.code.to(A.device)
-        if quant_state.blocksize not in [2048, 4096, 1024, 512, 256, 128]:
-            raise ValueError(f"The blockwise of {quant_state.blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128]")
+        supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
+        if HIP_ENVIRONMENT:
+            supported_blocksizes = supported_blocksizes[:-1]
+        if quant_state.blocksize not in supported_blocksizes:
+            raise ValueError(f"The blockwise of {quant_state.blocksize} is not supported. Supported values: {supported_blocksizes}")
         is_on_gpu([A, absmax, out])
         if out.dtype == torch.float32:
             lib.cdequantize_blockwise_fp32(get_ptr(quant_state.code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(A.numel()))
@@ -894,13 +900,17 @@ def get_4bit_type(typename, device=None, blocksize=64):
     return data.to(device)
 
 
-def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=128, compress_statistics=False, quant_storage=torch.uint8):
+def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=None, compress_statistics=False, quant_storage=torch.uint8):
+    if blocksize is None:
+        blocksize = 64 if not HIP_ENVIRONMENT else 128
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4', quant_storage)
 
-def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=128, compress_statistics=False, quant_storage=torch.uint8):
+def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=None, compress_statistics=False, quant_storage=torch.uint8):
+    if blocksize is None:
+        blocksize = 64 if not HIP_ENVIRONMENT else 128
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4', quant_storage)
 
-def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=128, compress_statistics=False, quant_type='fp4', quant_storage=torch.uint8) -> Tensor:
+def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=None, compress_statistics=False, quant_type='fp4', quant_storage=torch.uint8) -> Tensor:
     """
     Quantize tensor A in blocks of 4-bit values.
 
@@ -926,6 +936,8 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
     tuple(torch.Tensor, torch.Size, torch.dtype, int):
         The quantization state to undo the quantization.
     """
+    if blocksize is None:
+        blocksize = 64 if not HIP_ENVIRONMENT else 128
     if A.device.type != 'cuda':
         raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
     if quant_type not in ['fp4', 'nf4']:
@@ -944,7 +956,10 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
         mod = dtype2bytes[quant_storage] * 2
         out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
 
-    assert blocksize in [4096, 2048, 1024, 512, 256, 128]
+    if not HIP_ENVIRONMENT:
+        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+    else:
+        assert blocksize in [4096, 2048, 1024, 512, 256, 128]
 
     prev_device = pre_call(A.device)
     is_on_gpu([A, out, absmax])
@@ -981,13 +996,19 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
 
     return out, state
 
-def dequantize_fp4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 128) -> Tensor:
+def dequantize_fp4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = None) -> Tensor:
+    if blocksize is None:
+        blocksize = 64 if not HIP_ENVIRONMENT else 128
+
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
 
-def dequantize_nf4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 128) -> Tensor:
+def dequantize_nf4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = None) -> Tensor:
+    if blocksize is None:
+        blocksize = 64 if not HIP_ENVIRONMENT else 128
+    
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')
 
-def dequantize_4bit(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 128, quant_type='fp4') -> Tensor:
+def dequantize_4bit(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = None, quant_type='fp4') -> Tensor:
     """
     Dequantizes FP4 blockwise quantized values.
 
@@ -1014,8 +1035,15 @@ def dequantize_4bit(A: Tensor, quant_state: QuantState = None, absmax: Tensor =
     torch.Tensor:
         Dequantized tensor.
     """
-    if blocksize not in [2048, 4096, 1024, 512, 256, 128]:
-        raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128]")
+    if blocksize is None:
+        blocksize = 64 if not HIP_ENVIRONMENT else 128
+    
+    supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
+    if HIP_ENVIRONMENT:
+        supported_blocksizes = supported_blocksizes[:-1]
+
+    if blocksize not in supported_blocksizes:
+        raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: {supported_blocksizes}")
     if quant_type not in ['fp4', 'nf4']:
         raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
 

From d86d24cbe674087d80deb22fac2dc38538b3d800 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Wed, 6 Mar 2024 20:14:13 +0000
Subject: [PATCH 078/234] remove block size 64 and enable remaining tests

---
 bitsandbytes/research/autograd/_functions.py |  6 +++++-
 tests/test_autograd.py                       |  1 -
 tests/test_functional.py                     | 13 ++++++++-----
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py
index 0dff351e0..883121759 100644
--- a/bitsandbytes/research/autograd/_functions.py
+++ b/bitsandbytes/research/autograd/_functions.py
@@ -8,6 +8,7 @@
 import bitsandbytes.functional as F
 
 from bitsandbytes.autograd._functions import MatmulLtState, GlobalOutlierPooler
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 
 
 # math.prod not compatible with python < 3.8
@@ -376,7 +377,10 @@ def backward(ctx, grad_output):
 def get_block_sizes(input_matrix, weight_matrix):
     input_features = input_matrix.shape[-1]
     output_features = (weight_matrix.shape[0] if weight_matrix.shape[1] == input_features else weight_matrix.shape[1])
-    array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
+    if not HIP_ENVIRONMENT:
+        array = [4096, 2048, 1024, 512, 256, 128, 64, 0]
+    else:
+        array = [4096, 2048, 1024, 512, 256, 128, 0]
     bsz, bsz2 = 1024, 1024
     for i, k in enumerate(array):
         if input_features > array[i + 1]:
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 4c7e2b9df..29265cbfe 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -549,7 +549,6 @@ def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose,
 values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose))
 str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose))
 names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(*vals) for vals in str_values]
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", values, ids=names)
 def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
     dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index f914820fe..34e395e75 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -153,11 +153,14 @@ def test_dynamic_quantization():
         assert diff < 0.004
 
 
-
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
+def get_blocksizes(hip_env=False):
+    if not hip_env:
+        return [4096, 2048, 1024, 512, 256, 128, 64]
+    else:
+        return [4096, 2048, 1024, 512, 256, 128]
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"])
 @pytest.mark.parametrize("nested", [False, True], ids=["False", "True"])
-@pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
+@pytest.mark.parametrize("blocksize", get_blocksizes(HIP_ENVIRONMENT))
 @pytest.mark.parametrize("signed", [True, False], ids=['signed_True', 'signed_False'])
 def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed):
     #print('')
@@ -2283,10 +2286,10 @@ def test_fp4_quant(dtype):
     assert relerr.item() < 0.28
 
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("quant_type", ['fp4', 'nf4'])
 def test_4bit_compressed_stats(quant_type):
-    for blocksize in [128, 64]:
+    blocksizes = [128, 64] if not HIP_ENVIRONMENT else [128]
+    for blocksize in blocksizes:
         errs1 = []
         errs2 = []
         for i in range(10):

From cf4a506671cb8b1bd88e9e9a1377f21c2fdaa282 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Wed, 6 Mar 2024 22:19:09 +0000
Subject: [PATCH 079/234] Fix cuda build errors

---
 csrc/pythonInterface.c | 85 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 76 insertions(+), 9 deletions(-)

diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index 6b27de9b3..c74357758 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -34,8 +34,13 @@ void gemm_4bit_inference(int m, int n, int k, half * A,  unsigned char* B,  floa
 void gemm_4bit_inference_naive_fp16(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, float *datatype, half * out,  int lda, int ldb, int ldc, int blocksize)
 { gemm_4bit_inference_naive<half, 16>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
 
+#if defined(BUILD_CUDA)
+void gemm_4bit_inference_naive_bf16(int m, int n, int k, __nv_bfloat16 * A,  unsigned char* B,  float *absmax, float *datatype, __nv_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize)
+{ gemm_4bit_inference_naive<__nv_bfloat16, 16>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
+#elif defined(BUILD_HIP)
 void gemm_4bit_inference_naive_bf16(int m, int n, int k, hip_bfloat16 * A,  unsigned char* B,  float *absmax, float *datatype, hip_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize)
 { gemm_4bit_inference_naive<hip_bfloat16, 16>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
+#endif
 
 void gemm_4bit_inference_naive_fp32(int m, int n, int k, float * A,  unsigned char* B,  float *absmax, float *datatype, float * out,  int lda, int ldb, int ldc, int blocksize)
 { gemm_4bit_inference_naive<float, 32>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
@@ -60,12 +65,20 @@ MAKE_FUNC32(momentum, MOMENTUM, float, 32)
 MAKE_FUNC32(momentum, MOMENTUM, half, 16)
 MAKE_FUNC32(adam, ADAM, float, fp32)
 MAKE_FUNC32(adam, ADAM, half, fp16)
+#if defined(BUILD_CUDA)
+MAKE_FUNC32(adam, ADAM, __nv_bfloat16, bf16)
+#elif defined(BUILD_HIP)
 MAKE_FUNC32(adam, ADAM, hip_bfloat16, bf16)
+#endif
 MAKE_FUNC32(rmsprop, RMSPROP, float, 32)
 MAKE_FUNC32(rmsprop, RMSPROP, half, 16)
 MAKE_FUNC32(lion, LION, float, fp32)
 MAKE_FUNC32(lion, LION, half, fp16)
+#if defined(BUILD_CUDA)
+MAKE_FUNC32(lion, LION, __nv_bfloat16, bf16)
+#elif defined(BUILD_HIP)
 MAKE_FUNC32(lion, LION, hip_bfloat16, bf16)
+#endif
 MAKE_FUNC32(adagrad, ADAGRAD, float, 32)
 MAKE_FUNC32(adagrad, ADAGRAD, half, 16)
 
@@ -105,11 +118,18 @@ MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, fp16)
 MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, fp32)
 MAKE_BLOCKWISE8(adagrad, ADAGRAD, half, fp16)
 MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, fp32)
+#if defined(BUILD_CUDA)
+MAKE_BLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16)
+#elif defined(BUILD_HIP)
 MAKE_BLOCKWISE8(adam, ADAM, hip_bfloat16, bf16)
+#endif
 MAKE_BLOCKWISE8(lion, LION, half, fp16)
 MAKE_BLOCKWISE8(lion, LION, float, fp32)
+#if defined(BUILD_CUDA)
+MAKE_BLOCKWISE8(lion, LION, __nv_bfloat16, bf16)
+#elif defined(BUILD_HIP)
 MAKE_BLOCKWISE8(lion, LION, hip_bfloat16, bf16)
-
+#endif
 
 void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); }
 void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping<half>(g, gnorm_vec, step, n); }
@@ -118,9 +138,15 @@ void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char
 void quantizeBlockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
 void quantizeBlockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
 
+#if defined(BUILD_CUDA)
+void quantizeBlockwise_bf16(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<__nv_bfloat16, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_bf16_fp4(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<__nv_bfloat16, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_bf16_nf4(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<__nv_bfloat16, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
+#elif defined(BUILD_HIP)
 void quantizeBlockwise_bf16(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<hip_bfloat16, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
 void quantizeBlockwise_bf16_fp4(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<hip_bfloat16, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
 void quantizeBlockwise_bf16_nf4(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<hip_bfloat16, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
+#endif
 
 void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
 void quantizeBlockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
@@ -134,10 +160,15 @@ void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, floa
 void dequantizeBlockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, FP4>(NULL, A, absmax, out, blocksize, n); }
 void dequantizeBlockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, NF4>(NULL, A, absmax, out, blocksize, n); }
 
+#if defined(BUILD_CUDA)
+void dequantizeBlockwise_bf16(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<__nv_bfloat16, General8bit>(code, A, absmax, out, blocksize, n); }
+void dequantizeBlockwise_bf16_fp4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<__nv_bfloat16, FP4>(NULL, A, absmax, out, blocksize, n); }
+void dequantizeBlockwise_bf16_nf4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<__nv_bfloat16, NF4>(NULL, A, absmax, out, blocksize, n); }
+#elif defined(BUILD_HIP)
 void dequantizeBlockwise_bf16(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<hip_bfloat16, General8bit>(code, A, absmax, out, blocksize, n); }
 void dequantizeBlockwise_bf16_fp4(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<hip_bfloat16, FP4>(NULL, A, absmax, out, blocksize, n); }
 void dequantizeBlockwise_bf16_nf4(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise<hip_bfloat16, NF4>(NULL, A, absmax, out, blocksize, n); }
-
+#endif
 
 #ifndef NO_HIPBLASLT
 #if BUILD_CUDA
@@ -158,9 +189,6 @@ void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(hipblasLtHandle_t lt
 #endif
 
 MAKE_FUNC_TRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8);
-MAKE_FUNC_TRANSFORM(8, row, col, t, int8_t, ROW, COL, true, 8);
-MAKE_FUNC_TRANSFORM(32, row, col, n, int32_t, ROW, COL, false, 32);
-MAKE_FUNC_TRANSFORM(32, row, col, t, int32_t, ROW, COL, true, 32);
 MAKE_FUNC_TRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8);
 MAKE_FUNC_TRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8);
 MAKE_FUNC_TRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32);
@@ -168,9 +196,15 @@ MAKE_FUNC_TRANSFORM(8, row, col_turing, n, int8_t, ROW, COL_TURING, false, 8);
 MAKE_FUNC_TRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8);
 MAKE_FUNC_TRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8);
 MAKE_FUNC_TRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32);
+
+#if defined(BUILD_HIP)
+MAKE_FUNC_TRANSFORM(8, row, col, t, int8_t, ROW, COL, true, 8);
+MAKE_FUNC_TRANSFORM(32, row, col, n, int32_t, ROW, COL, false, 32);
+MAKE_FUNC_TRANSFORM(32, row, col, t, int32_t, ROW, COL, true, 32);
 MAKE_FUNC_TRANSFORM(8, col, row, n, int8_t, COL, ROW, false, 8);
 MAKE_FUNC_TRANSFORM(32, col, row, n, int32_t, COL, ROW, false, 32);
 #endif
+#endif
 
 void transform_row2col32(char * A, char *out, int rows, int cols){ transformRowToFormat<COL32, 0>(A, out, rows, cols); }
 void transform_row2col32T(char * A, char *out, int rows, int cols){ transformRowToFormat<COL32, 1>(A, out, rows, cols); }
@@ -258,6 +292,16 @@ extern "C"
   void cdequantize_blockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); }
   void cdequantize_blockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); }
 
+#if defined(BUILD_CUDA)
+  void cquantize_blockwise_bf16(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16(code, A, absmax, out, blocksize, n); }
+  void cquantize_blockwise_bf16_fp4(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16_fp4(code, A, absmax, out, blocksize, n); }
+  void cquantize_blockwise_bf16_nf4(float * code, __nv_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16_nf4(code, A, absmax, out, blocksize, n); }
+
+  void cdequantize_blockwise_bf16(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_bf16_fp4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16_fp4(code, A, absmax, out, blocksize, n); }
+  void cdequantize_blockwise_bf16_nf4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16_nf4(code, A, absmax, out, blocksize, n); }
+
+#elif defined(BUILD_HIP)
   void cquantize_blockwise_bf16(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16(code, A, absmax, out, blocksize, n); }
   void cquantize_blockwise_bf16_fp4(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16_fp4(code, A, absmax, out, blocksize, n); }
   void cquantize_blockwise_bf16_nf4(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_bf16_nf4(code, A, absmax, out, blocksize, n); }
@@ -265,7 +309,7 @@ extern "C"
   void cdequantize_blockwise_bf16(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16(code, A, absmax, out, blocksize, n); }
   void cdequantize_blockwise_bf16_fp4(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16_fp4(code, A, absmax, out, blocksize, n); }
   void cdequantize_blockwise_bf16_nf4(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n){ dequantizeBlockwise_bf16_nf4(code, A, absmax, out, blocksize, n); }
-
+#endif
 	#define MAKE_CFUNC32(name, gtype, gbits) \
 	void c##name##32bit_grad_##gbits(gtype *g, gtype *p, \
 								 float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
@@ -275,14 +319,22 @@ extern "C"
 
 	MAKE_CFUNC32(adam, float, fp32)
 	MAKE_CFUNC32(adam, half, fp16)
+	#if defined(BUILD_CUDA)
+	MAKE_CFUNC32(adam, __nv_bfloat16, bf16)
+	#elif defined(BUILD_HIP)
 	MAKE_CFUNC32(adam, hip_bfloat16, bf16)
+	#endif
 	MAKE_CFUNC32(momentum, float, 32)
 	MAKE_CFUNC32(momentum, half, 16)
 	MAKE_CFUNC32(rmsprop, float, 32)
 	MAKE_CFUNC32(rmsprop, half, 16)
 	MAKE_CFUNC32(lion, float, fp32)
 	MAKE_CFUNC32(lion, half, fp16)
+	#if defined(BUILD_CUDA)
+	MAKE_CFUNC32(lion, __nv_bfloat16, bf16)
+	#elif defined(BUILD_HIP)
 	MAKE_CFUNC32(lion, hip_bfloat16, bf16)
+	#endif
 	MAKE_CFUNC32(adagrad, float, 32)
 	MAKE_CFUNC32(adagrad, half, 16)
 
@@ -322,10 +374,18 @@ extern "C"
 	MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, fp32)
 	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, half, fp16)
 	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, float, fp32)
+	#if defined(BUILD_CUDA)
+	MAKE_CBLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16)
+	#elif defined(BUILD_HIP)
 	MAKE_CBLOCKWISE8(adam, ADAM, hip_bfloat16, bf16)
+	#endif
 	MAKE_CBLOCKWISE8(lion, LION, half, fp16)
 	MAKE_CBLOCKWISE8(lion, LION, float, fp32)
+	#if defined(BUILD_CUDA)
+	MAKE_CBLOCKWISE8(lion, LION, __nv_bfloat16, bf16)
+	#elif defined(BUILD_HIP)
 	MAKE_CBLOCKWISE8(lion, LION, hip_bfloat16, bf16)
+	#endif
 
 	void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); }
 	void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); }
@@ -409,9 +469,6 @@ extern "C"
 
 
 	MAKE_FUNC_CTRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8)
-	MAKE_FUNC_CTRANSFORM(8, row, col, t, int8_t, ROW, COL, true, 8)
-	MAKE_FUNC_CTRANSFORM(32, row, col, n, int32_t, ROW, COL, false, 32)
-	MAKE_FUNC_CTRANSFORM(32, row, col, t, int32_t, ROW, COL, true, 32)
 	MAKE_FUNC_CTRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8)
 	MAKE_FUNC_CTRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32)
@@ -419,8 +476,14 @@ extern "C"
 	MAKE_FUNC_CTRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8)
 	MAKE_FUNC_CTRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32)
+
+	#if defined(BUILD_HIP)
+	MAKE_FUNC_CTRANSFORM(8, row, col, t, int8_t, ROW, COL, true, 8)
+	MAKE_FUNC_CTRANSFORM(32, row, col, n, int32_t, ROW, COL, false, 32)
+	MAKE_FUNC_CTRANSFORM(32, row, col, t, int32_t, ROW, COL, true, 32)
 	MAKE_FUNC_CTRANSFORM(8, col, row, n, int8_t, COL, ROW, false, 8)
 	MAKE_FUNC_CTRANSFORM(32, col, row, n, int32_t, COL, ROW, false, 32)
+	#endif
 #endif
 	void cdequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols)
 	{ dequant_mm_int32_fp16(A, rowStats, colStats, out, newRowStats, newcolStats, bias, numRows, numCols); }
@@ -531,7 +594,11 @@ extern "C"
 	void cgemm_4bit_inference_naive_fp16(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, float *datatype, half * out,  int lda, int ldb, int ldc, int blocksize)
 	{ gemm_4bit_inference_naive_fp16(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
 
+	#if defined(BUILD_CUDA)
+	void cgemm_4bit_inference_naive_bf16(int m, int n, int k, __nv_bfloat16 * A,  unsigned char* B,  float *absmax, float *datatype, __nv_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize)
+	#elif defined(BUILD_HIP)
 	void cgemm_4bit_inference_naive_bf16(int m, int n, int k, hip_bfloat16 * A,  unsigned char* B,  float *absmax, float *datatype, hip_bfloat16 * out,  int lda, int ldb, int ldc, int blocksize)
+	#endif
 	{ gemm_4bit_inference_naive_bf16(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
 
 	void cgemm_4bit_inference_naive_fp32(int m, int n, int k, float * A,  unsigned char* B,  float *absmax, float *datatype, float * out,  int lda, int ldb, int ldc, int blocksize)

From 707719568b194530f45795711a1122a8dcfff9b5 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 02:02:34 +0000
Subject: [PATCH 080/234] remove workspace in igemmlt

---
 csrc/ops.hip | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index 27e479573..2693ffa63 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -536,11 +536,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
     else
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
 
-    //Set User Preference attributes
-    int64_t max_workspace_size = 32 * 1024 * 1024 * 4;
-    void*    d_workspace;
-    //NEED HIP CHECK ERROR
-    //hipMalloc(&d_workspace, max_workspace_size);
+    const int64_t max_workspace_size = 0;//set to 0 to avoid choosing GSU kernel
 
     if(DTYPE_OUT == 32)
     {
@@ -580,17 +576,14 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
                                                             heuristicResult,
                                                             &returnedAlgoCount));
 
-      auto toMalloc = max(heuristicResult[0].workspaceSize, max_workspace_size);
-
-      //printf("\n\n1Got algosn: %d %d %d\n\n",returnedAlgoCount, heuristicResult[0].workspaceSize, toMalloc);
-      //NEED HIP CHECK ERROR
-      auto err = hipMalloc(&d_workspace, toMalloc);
-      //printf("Hipmalloc\n");
-      //printf(hipError_to_string(err).c_str());
-      //printf("\n");
-      has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, &heuristicResult[0].algo, d_workspace, toMalloc, 0));
-//hipStreamSynchronize(0);
-      hipFree(d_workspace);
+      if (returnedAlgoCount == 0)
+      {
+        has_error = 1;
+      }
+      else
+      {
+        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+      }
     }
     else
     {
@@ -622,23 +615,19 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
                                                             heuristicResult,
                                                             &returnedAlgoCount));
 
-      //NEED HIP CHECK ERROR
-      hipMalloc(&d_workspace, heuristicResult[0].workspaceSize);
       if(!SCALE_ROWS)
       {
         float alpha = 1.0f, beta = 0.0f;
 
-        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, d_workspace, max_workspace_size, 0));
+        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
       }
       else
       {
         //has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
 	float beta = 0.0f;
 
-        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, d_workspace, max_workspace_size, 0));
+        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
       }
-
-      hipFree(d_workspace);
     }
 
 

From ec32fc1c4067e6fb1f3f98a2bbccc859e796afdb Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:06:17 +0000
Subject: [PATCH 081/234] Enabled igemmlt in matmul

---
 bitsandbytes/autograd/_functions.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index c8d50ea86..59b0ac7b2 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -224,10 +224,8 @@ def backward(ctx, grad_output):
 
 def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""
-    """Important: Could I use igemmlt on ROCm? """
     if torch.version.hip:
-        #Well, lets currently disable it
-        return False
+        return True
     if torch.cuda.get_device_capability(device=device) < (7, 5):
         return False
     device_name = torch.cuda.get_device_name(device=device)

From 4536b251209cbb2b6085a85ed24c8895a41ead0d Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:08:21 +0000
Subject: [PATCH 082/234] Fix shape issue in transform function

---
 bitsandbytes/functional.py | 4 ++--
 tests/test_functional.py   | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 2146176ed..a0f2a6c49 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -452,13 +452,13 @@ def get_transform_buffer(
         rows = shape[0] * shape[1]
     cols = shape[-1]
 
-    state = (shape, to_order)
     if transpose:
         # swap dims
         tmp = rows
         rows = cols
         cols = tmp
-        state = (shape[::-1], to_order)
+        shape = shape[::-1]
+    state = (shape, to_order)
 
     if to_order == "row" or to_order == "col":
         return init_func(shape, dtype=dtype, device=device), state
diff --git a/tests/test_functional.py b/tests/test_functional.py
index f914820fe..05b52103e 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1309,7 +1309,6 @@ def test_row_scale_bench(dim1, dim4, inner):
     for vals in values
 ]
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize(
     "dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",
     values,

From 66e34c18d5cac3b28b46f09f00da2f25d41dac7f Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:11:50 +0000
Subject: [PATCH 083/234] Enable igemmlt int8 output

---
 bitsandbytes/functional.py | 7 +++++--
 csrc/ops.hip               | 4 +++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index a0f2a6c49..d19f88f83 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -2524,7 +2524,10 @@ def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
 def extract_outliers(A, SA, idx):
     shapeA = SA[0]
     formatA = SA[1]
-    assert formatA in ["col_turing", "col_ampere"]
+    if not HIP_ENVIRONMENT:
+        assert formatA in ["col_turing", "col_ampere"]
+    else:
+        assert formatA in ["col"]
     assert A.device.type == "cuda"
 
     out = torch.zeros(
@@ -2539,7 +2542,7 @@ def extract_outliers(A, SA, idx):
     ptrOut = get_ptr(out)
 
     prev_device = pre_call(A.device)
-    if formatA == 'col_turing':
+    if formatA == 'col_turing' or HIP_ENVIRONMENT:
         lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
     elif formatA == "col_ampere":
         lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 27e479573..8e1347840 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -594,7 +594,9 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
     }
     else
     {
-      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIP_R_32F));
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIP_R_8I));
+      hipblasOperation_t opA = HIPBLAS_OP_N;
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(opA)));
       has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIP_R_8I, m, n, ldc));
       has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));

From 7e5e223118fe2ce336613630721f7663c7e1530e Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:24:06 +0000
Subject: [PATCH 084/234] Add col format for extract outliers

---
 csrc/kernels.hip | 13 ++++++++++++-
 csrc/ops.hip     |  7 +++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 458f7f1c0..50e66d87d 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -2974,7 +2974,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 {
 	int local_colidx = idx[blockIdx.x];
 
-	if(FORMAT==COL_TURING)
+	/*if(FORMAT==COL_TURING)
 	{
 		// TURING FORMAT:
 		// 8*32 tiles with 4*4 subtiles
@@ -3030,6 +3030,17 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 			int out_idx = (row*idx_size) + blockIdx.x;
 			out[out_idx] = val;
 		}
+	}*/
+
+	//Only col format is used on ROCm
+	for(int row = threadIdx.x; row < rowsA; row+= blockDim.x)
+	{
+		//col-major offset
+		int offset = local_colidx * rowsA + row;
+	
+		char val = A[offset];
+		int out_idx = (row*idx_size) + blockIdx.x;
+		out[out_idx] = val;
 	}
 }
 
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 8e1347840..41b29ba39 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -833,14 +833,17 @@ template <int FORMAT> void extractOutliers(char * A, int *idx, char *out, int id
 
 	int num_blocks = idx_size;
 
-  if(FORMAT == COL_TURING)
+  /*if(FORMAT == COL_TURING)
   {
       tiledRows = fill_up_to_nearest_multiple(rows, 8);
   }
   else if(FORMAT == COL_AMPERE)
   {
       tiledRows = fill_up_to_nearest_multiple(rows, 32);
-	}
+  }*/
+
+  //for col format on ROCm
+  tiledRows = rows;
 
  hipLaunchKernelGGL(( kExtractOutliers<FORMAT>), dim3(num_blocks), dim3(threads), 0, 0, A, idx, out, idx_size, rows, cols, tiledRows, tiledCols);
   CUDA_CHECK_RETURN(hipPeekAtLastError());

From 2e42adb8c7993f466d63cff3b81accf007701610 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:25:56 +0000
Subject: [PATCH 085/234] Enable dequant_mm

---
 bitsandbytes/functional.py |  2 ++
 csrc/kernels.hip           | 57 ++++++++++++++++++++++++++++----------
 csrc/ops.hip               | 19 +++++++------
 tests/test_functional.py   |  1 -
 4 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index d19f88f83..8858e846a 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1952,6 +1952,8 @@ def mm_dequant(
     new_col_stats=None,
     bias=None
 ):
+    if HIP_ENVIRONMENT:
+        A, quant_state = nvidia_transform(A, "row", state = quant_state)
     assert A.dtype == torch.int32
     if bias is not None: assert bias.dtype == torch.float16
     out_shape = quant_state[0]
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 50e66d87d..723504fa8 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -2300,13 +2300,16 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
 
   const int n_out = numRows*numCols;
 
-  int num_row_tiles = (numRows/SUBTILE_ROWS) + (numRows % SUBTILE_ROWS == 0 ? 0 : 1);
+  //int num_row_tiles = (numRows/SUBTILE_ROWS) + (numRows % SUBTILE_ROWS == 0 ? 0 : 1);
   // we have tiles of size numRows*32, thus col only increases every numRows
   // num_row_tiles is the tiles after which the column increases by 32
   // blockIdx.x is the index of the current tile
-  int col = ((threadIdx.x % 32) + ((blockIdx.x/num_row_tiles)*32));
+  //int col = ((threadIdx.x % 32) + ((blockIdx.x/num_row_tiles)*32));
   // base_row increases by SUBTILE_ROWS every block. It wraps back to zero once num_row_tiles is reached
-  int base_row = (blockIdx.x*SUBTILE_ROWS) % (num_row_tiles*SUBTILE_ROWS);
+  //int base_row = (blockIdx.x*SUBTILE_ROWS) % (num_row_tiles*SUBTILE_ROWS);
+
+  int block_offset = blockIdx.x * THREADS * ITEMS_PER_THREAD;
+  int thread_offset = threadIdx.x * ITEMS_PER_THREAD;
 
   // SUBTILE_ROWS is independent from ITEMS_PER_THREAD is independent from THREADS
   // subtiles have 32*SUBTILE_ROWS elements <= THREADS*ITEMS_PER_THREAD
@@ -2321,20 +2324,33 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
 
   int local_values[ITEMS_PER_THREAD];
   half local_output[ITEMS_PER_THREAD];
-  float local_rowStats[ITEMS_PER_THREAD];
-  __shared__ float smem_rowStats[SUBTILE_ROWS];
+  //float local_rowStats[ITEMS_PER_THREAD];
+  //__shared__ float smem_rowStats[SUBTILE_ROWS];
 
   typedef hipcub::BlockLoad<int, THREADS, ITEMS_PER_THREAD, hipcub::BLOCK_LOAD_DIRECT> LoadInt32;
-  typedef hipcub::BlockExchange<int, THREADS, ITEMS_PER_THREAD> ExchangeInt32;
+  //typedef hipcub::BlockExchange<int, THREADS, ITEMS_PER_THREAD> ExchangeInt32;
   __shared__ typename LoadInt32::TempStorage loadint32;
-  __shared__ typename ExchangeInt32::TempStorage exchangeint32;
+  //__shared__ typename ExchangeInt32::TempStorage exchangeint32;
 
 
   // L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
-  float colStat = col >= numCols ? 0.0f : colStats[col];
-  float local_biasValue = ((bias == NULL) || (col >= numCols)) ? 0.0f : __half2float(bias[col]);
+  //float colStat = col >= numCols ? 0.0f : colStats[col];
+  //float local_biasValue = ((bias == NULL) || (col >= numCols)) ? 0.0f : __half2float(bias[col]);
+  int row_idx, col_idx;
+  float colStat[ITEMS_PER_THREAD];
+  float local_biasValue[ITEMS_PER_THREAD];
+  float rowStat[ITEMS_PER_THREAD];
+  #pragma unroll ITEMS_PER_THREAD
+  for(int j = 0; j < ITEMS_PER_THREAD; j++)
+  {
+       row_idx = (block_offset + thread_offset + j) / numCols;
+       col_idx = (block_offset + thread_offset + j) % numCols;
+       colStat[j] = col_idx >= numCols ? 0.0f : colStats[col_idx];
+       local_biasValue[j] = ((bias == NULL) || (col_idx >= numCols)) ? 0.0f : __half2float(bias[col_idx]);
+       rowStat[j] = row_idx >= numRows ? 0.0f : rowStats[row_idx];
+  }
   // no block loads for rows for now -- keep it simple
-  for(int j = threadIdx.x; j < SUBTILE_ROWS; j+=blockDim.x)
+  /*for(int j = threadIdx.x; j < SUBTILE_ROWS; j+=blockDim.x)
   {
     // todo: is this global mem access slow due to overlaps or does the L1 cache work well here?
     int row = (base_row+j) % numRows; // wrap around
@@ -2342,12 +2358,25 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
     // todo: update description about striped shared memory, it is not needed
     // rowidx: [0, 1, 2, 3...] and each warp reads ITEMS_PER_THREAD consequitive elements
     smem_rowStats[j] = rowStats[row];
-  }
+  }*/
   __syncthreads();
 
+  int valid_items = block_offset + THREADS * ITEMS_PER_THREAD < n_out ? THREADS * ITEMS_PER_THREAD : n_out - block_offset;
+  LoadInt32(loadint32).Load(&(A[block_offset]), local_values, valid_items, 0);
 
+  #pragma unroll ITEMS_PER_THREAD
+  for(int j = 0; j < ITEMS_PER_THREAD; j++)
+    local_output[j] = __float2half((local_values[j]*MM_DEQUANT_CONST*rowStat[j]*colStat[j]) + local_biasValue[j]);
+ 
   // each block processes SUBTILE_ROWS*32 elements
-  const int items_per_load = THREADS*ITEMS_PER_THREAD;
+  #pragma unroll ITEMS_PER_THREAD
+  for(int j = 0; j < ITEMS_PER_THREAD; j++)
+  {
+    int outIdx = block_offset + thread_offset + j;
+    if(outIdx< n_out)
+      out[outIdx] = local_output[j];
+  }
+  /*const int items_per_load = THREADS*ITEMS_PER_THREAD;
   const int rows_per_load = items_per_load/32;
 
   int subtile_base_row = (threadIdx.x / 32)*ITEMS_PER_THREAD; // row within the tile
@@ -2368,7 +2397,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
     #pragma unroll ITEMS_PER_THREAD
     for(int j = 0; j < ITEMS_PER_THREAD; j++)
       local_rowStats[j] = smem_rowStats[subtile_base_row+row_offset+j];
-
+	
     #pragma unroll ITEMS_PER_THREAD
     for(int j = 0; j < ITEMS_PER_THREAD; j++)
       local_output[j] = __float2half((local_values[j]*MM_DEQUANT_CONST*local_rowStats[j]*colStat) + local_biasValue);
@@ -2388,7 +2417,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
     }
 
     row_offset += rows_per_load;
-  }
+  }*/
 }
 
 
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 41b29ba39..aa16e9c3f 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -664,14 +664,17 @@ int fill_up_to_nearest_multiple(int value, int multiple)
 void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half *bias, int numRows, int numCols)
 {
   int threads = 512;
-  int tileCols = fill_up_to_nearest_multiple(numCols, 32);
-  int n = numRows*tileCols;
-  int subtile_rows = 128;
-  int tilesize = 32*subtile_rows;
-  int num_blocks = numRows/subtile_rows;
-  num_blocks += (numRows % subtile_rows == 0) ? 0 : 1;
-  num_blocks = num_blocks*(tileCols/32);
-  assert(threads <= tilesize);
+  //int tileCols = fill_up_to_nearest_multiple(numCols, 32);
+  //int n = numRows*tileCols;
+  int n = numRows*numCols;
+  //int subtile_rows = 128;
+  //int tilesize = 32*subtile_rows;
+  //int num_blocks = numRows/subtile_rows;
+  //num_blocks += (numRows % subtile_rows == 0) ? 0 : 1;
+  //num_blocks = num_blocks*(tileCols/32);
+  //assert(threads <= tilesize);
+  int num_blocks = numRows * numCols / (threads * 4);
+  num_blocks += (numRows * numCols) % (threads * 4) == 0 ? 0 : 1;
 
  hipLaunchKernelGGL(( kdequant_mm_int32_fp16<4, 128, 512>), dim3(num_blocks), dim3(threads), 0, 0, A, rowStats, colStats, out, newRowStats, newcolStats, bias, numRows, numCols, tileCols, n);
   CUDA_CHECK_RETURN(hipPeekAtLastError());
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 05b52103e..6d7ace64b 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -967,7 +967,6 @@ def test_bench_8bit_training(batch, seq, model, hidden):
 values = list(product(dim1, dim4, dims, formatB, has_bias))
 names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values]
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names)
 def test_dequant_mm(dim1, dim4, dims, formatB, has_bias):
     inner = torch.randint(1, 128, size=(1,)).item()

From e32d2770bcfb2ad2e02d65d1ba81bb7bb4287799 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:26:42 +0000
Subject: [PATCH 086/234] Enable matmullt tests

---
 tests/test_autograd.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 4c7e2b9df..7c28dc436 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -288,7 +288,6 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
 )
 names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values]
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize(
     "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias",
     values,

From 8206bd18cedc7555d5dd656db28c510d39b408ae Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:27:15 +0000
Subject: [PATCH 087/234] Enabled linear_serialization tests

---
 tests/test_linear8bitlt.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 6d5fc6a82..b75fa4efd 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -68,7 +68,6 @@ def test_linear_no_igemmlt():
     assert linear_custom.state.CxB is None
 
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt",
                          list(product([False, True], [False, True], [False, True], [False, True])))
 def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt):

From 973a9f8c882612bb13935d39cf5bbfb21f17e907 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:28:57 +0000
Subject: [PATCH 088/234] fix error with dequant_mm change

---
 csrc/ops.hip | 1 +
 1 file changed, 1 insertion(+)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index aa16e9c3f..cb0acf851 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -666,6 +666,7 @@ void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out,
   int threads = 512;
   //int tileCols = fill_up_to_nearest_multiple(numCols, 32);
   //int n = numRows*tileCols;
+  int tileCols = numCols;
   int n = numRows*numCols;
   //int subtile_rows = 128;
   //int tilesize = 32*subtile_rows;

From 387a9b79659b7ca999a12d42b366629fcdc11079 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:36:15 +0000
Subject: [PATCH 089/234] Enable extract outliers test

---
 tests/test_functional.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 6d7ace64b..01a4f3f77 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2040,7 +2040,6 @@ def quant_zp(x):
     print(err1, err2, err3, err4, err5, err6)
 
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_extract_outliers():
     for i in range(k):
         shapeA = (4096, 4096 * 4)

From 93dfb51a012786e33c737542336054f3fa2174ee Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:38:14 +0000
Subject: [PATCH 090/234] Enable test overflow

---
 tests/test_functional.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 01a4f3f77..291a2ea24 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1356,7 +1356,6 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
     for vals in values
 ]
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_overflow():
     formatB = F.get_special_format_str()
     print(formatB)

From 90bbdc609291ebf3a263134f9091be191214f0be Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 12 Mar 2024 18:47:57 +0000
Subject: [PATCH 091/234] Skip overflow and linear serialization for now

---
 tests/test_functional.py   | 1 +
 tests/test_linear8bitlt.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 291a2ea24..01a4f3f77 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1356,6 +1356,7 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
     for vals in values
 ]
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_overflow():
     formatB = F.get_special_format_str()
     print(formatB)
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index b75fa4efd..6d5fc6a82 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -68,6 +68,7 @@ def test_linear_no_igemmlt():
     assert linear_custom.state.CxB is None
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt",
                          list(product([False, True], [False, True], [False, True], [False, True])))
 def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt):

From 3dc14e8575636df0b43370a3030118beb646b7a1 Mon Sep 17 00:00:00 2001
From: "U-AMD\\zhaoyili" <zhaoyili@AUS-L1-ZHAOYILI.amd.com>
Date: Mon, 18 Mar 2024 15:36:31 -0500
Subject: [PATCH 092/234] improve the gemv 4bit accuracy by forcing the hipcub
 to 32

---
 csrc/kernels.hip         | 4 ++--
 tests/test_functional.py | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 723504fa8..54b6afb9d 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -500,7 +500,7 @@ __global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index
 template<typename T, int BLOCK_SIZE, int NUM_MAX>
 __global__ void kCompressMax(T * __restrict__ const A, T* out, unsigned char* out_idx, const int n)
 {
-  typedef hipcub::WarpReduce<T> WarpReduce;
+  typedef hipcub::WarpReduce<T, 32> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp_storage;
   typedef hipcub::BlockLoad<T, BLOCK_SIZE/8 , 8, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
   __shared__ typename LoadT::TempStorage loadt;
@@ -3553,7 +3553,7 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
   // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps]
   // 4 warps -> 4 loads per iter
   // 1x32 * 32x4 -> 1x4 outputs per thread block
-  typedef hipcub::WarpReduce<float> WarpReduce;
+  typedef hipcub::WarpReduce<float, 32> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp_storage[THREADS/32];
 
   const int warp_idx = threadIdx.x / 32;
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 4591bd85c..7d8df2e48 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2543,7 +2543,6 @@ def test_managed():
 @pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4'])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])
 @pytest.mark.parametrize("double_quant", [False], ids=['DQ_True'])
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_gemv_eye_4bit(storage_type, dtype, double_quant):
     dims = 10
     torch.random.manual_seed(np.random.randint(0, 412424242))

From 485ba8f878ad4d256af4c1f1c34bbe5f9918fb56 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 19 Mar 2024 21:21:53 +0000
Subject: [PATCH 093/234] Update skip comment

---
 tests/test_functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 7d8df2e48..5dba4ef5f 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2375,7 +2375,7 @@ def test_normal_map_tree():
 @pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed'], ids=['fc1', 'fc2', 'attn', 'attn_packed'])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])
 @pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=['uint8', 'fp16', 'bf16', 'fp32'])
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64")
 def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind):
     for dim in [128, 256, 512, 1024]:
     #for dim in [4*1024]:

From adfb5e20d57aaaba5cda7c94d7f24f0f77f4a5f1 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Wed, 27 Mar 2024 20:21:48 -0700
Subject: [PATCH 094/234] clean up device setup

---
 bitsandbytes/device_setup/__init__.py      |   0
 bitsandbytes/device_setup/cuda/__init__.py |   0
 bitsandbytes/device_setup/cuda/env_vars.py |  53 ---
 bitsandbytes/device_setup/cuda/main.py     | 393 ---------------------
 4 files changed, 446 deletions(-)
 delete mode 100644 bitsandbytes/device_setup/__init__.py
 delete mode 100644 bitsandbytes/device_setup/cuda/__init__.py
 delete mode 100644 bitsandbytes/device_setup/cuda/env_vars.py
 delete mode 100644 bitsandbytes/device_setup/cuda/main.py

diff --git a/bitsandbytes/device_setup/__init__.py b/bitsandbytes/device_setup/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/bitsandbytes/device_setup/cuda/__init__.py b/bitsandbytes/device_setup/cuda/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/bitsandbytes/device_setup/cuda/env_vars.py b/bitsandbytes/device_setup/cuda/env_vars.py
deleted file mode 100644
index 4b2549653..000000000
--- a/bitsandbytes/device_setup/cuda/env_vars.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-from typing import Dict
-
-
-def to_be_ignored(env_var: str, value: str) -> bool:
-    ignorable = {
-        "PWD",  # PWD: this is how the shell keeps track of the current working dir
-        "OLDPWD",
-        "SSH_AUTH_SOCK",  # SSH stuff, therefore unrelated
-        "SSH_TTY",
-        "GOOGLE_VM_CONFIG_LOCK_FILE",  # GCP: requires elevated permissions, causing problems in VMs and Jupyter notebooks
-        "HOME",  # Linux shell default
-        "TMUX",  # Terminal Multiplexer
-        "XDG_DATA_DIRS",  # XDG: Desktop environment stuff
-        "XDG_GREETER_DATA_DIR",  # XDG: Desktop environment stuff
-        "XDG_RUNTIME_DIR",
-        "MAIL",  # something related to emails
-        "SHELL",  # binary for currently invoked shell
-        "DBUS_SESSION_BUS_ADDRESS",  # hardware related
-        "PATH",  # this is for finding binaries, not libraries
-        "LESSOPEN",  # related to the `less` command
-        "LESSCLOSE",
-        "_",  # current Python interpreter
-    }
-    return env_var in ignorable
-
-
-def might_contain_a_path(candidate: str) -> bool:
-    return os.sep in candidate
-
-
-def is_active_conda_env(env_var: str) -> bool:
-    return "CONDA_PREFIX" == env_var
-
-
-def is_other_conda_env_var(env_var: str) -> bool:
-    return "CONDA" in env_var
-
-
-def is_relevant_candidate_env_var(env_var: str, value: str) -> bool:
-    return is_active_conda_env(env_var) or (
-        might_contain_a_path(value) and not
-        is_other_conda_env_var(env_var) and not
-        to_be_ignored(env_var, value)
-    )
-
-
-def get_potentially_lib_path_containing_env_vars() -> Dict[str, str]:
-    return {
-        env_var: value
-        for env_var, value in os.environ.items()
-        if is_relevant_candidate_env_var(env_var, value)
-    }
diff --git a/bitsandbytes/device_setup/cuda/main.py b/bitsandbytes/device_setup/cuda/main.py
deleted file mode 100644
index 36224d2f9..000000000
--- a/bitsandbytes/device_setup/cuda/main.py
+++ /dev/null
@@ -1,393 +0,0 @@
-"""
-extract factors the build is dependent on:
-[X] compute capability
-    [ ] TODO: Q - What if we have multiple GPUs of different makes?
-- CUDA version
-- Software:
-    - CPU-only: only CPU quantization functions (no optimizer, no matrix multiply)
-    - CuBLAS-LT: full-build 8-bit optimizer
-    - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
-
-evaluation:
-    - if paths faulty, return meaningful error
-    - else:
-        - determine CUDA version
-        - determine capabilities
-        - based on that set the default path
-"""
-
-import ctypes as ct
-import errno
-import os
-from pathlib import Path
-import platform
-from typing import Set, Union
-from warnings import warn
-
-import torch
-
-from .env_vars import get_potentially_lib_path_containing_env_vars
-
-DYNAMIC_LIBRARY_SUFFIX = { "Darwin": ".dylib", "Windows": ".dll", "Linux": ".so"}.get(platform.system(), ".so")
-if platform.system() == "Windows":  # Windows
-    CUDA_RUNTIME_LIBS = ["nvcuda.dll"]
-else:  # Linux or other
-    # these are the most common libs names
-    # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
-    # we have libcudart.so.11.0 which causes a lot of errors before
-    # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt
-    CUDA_RUNTIME_LIBS = ["libcudart.so", "libcudart.so.11.0", "libcudart.so.12.0", "libcudart.so.12.1", "libcudart.so.12.2"]
-
-
-class CUDASetup:
-    _instance = None
-
-    def __init__(self):
-        raise RuntimeError("Call get_instance() instead")
-
-    def generate_instructions(self):
-        if getattr(self, 'error', False): return
-        print(self.error)
-        self.error = True
-        if not self.cuda_available:
-            self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected or CUDA not installed.')
-            self.add_log_entry('CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig.')
-            self.add_log_entry('CUDA SETUP: Solution 2): If you do not have sudo rights, you can do the following:')
-            self.add_log_entry('CUDA SETUP: Solution 2a): Find the cuda library via: find / -name libcuda.so 2>/dev/null')
-            self.add_log_entry('CUDA SETUP: Solution 2b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_2a')
-            self.add_log_entry('CUDA SETUP: Solution 2c): For a permanent solution add the export from 2b into your .bashrc file, located at ~/.bashrc')
-            self.add_log_entry('CUDA SETUP: Solution 3): For a missing CUDA runtime library (libcudart.so), use `find / -name libcudart.so* and follow with step (2b)')
-            return
-
-        if self.cudart_path is None:
-            self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected.')
-            self.add_log_entry('CUDA SETUP: Solution 1: To solve the issue the libcudart.so location needs to be added to the LD_LIBRARY_PATH variable')
-            self.add_log_entry('CUDA SETUP: Solution 1a): Find the cuda runtime library via: find / -name libcudart.so 2>/dev/null')
-            self.add_log_entry('CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a')
-            self.add_log_entry('CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc')
-            self.add_log_entry('CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.')
-            self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh')
-            self.add_log_entry('CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO.')
-            self.add_log_entry('CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local')
-
-            return
-
-        make_cmd = f'CUDA_VERSION={self.cuda_version_string}'
-        if len(self.cuda_version_string) < 3:
-            make_cmd += ' make cuda92'
-        elif self.cuda_version_string == '110':
-            make_cmd += ' make cuda110'
-        elif self.cuda_version_string[:2] == '11' and int(self.cuda_version_string[2]) > 0:
-            make_cmd += ' make cuda11x'
-        elif self.cuda_version_string[:2] == '12' and 1 >= int(self.cuda_version_string[2]) >= 0:
-            make_cmd += ' make cuda12x'
-        elif self.cuda_version_string == '100':
-            self.add_log_entry('CUDA SETUP: CUDA 10.0 not supported. Please use a different CUDA version.')
-            self.add_log_entry('CUDA SETUP: Before you try again running bitsandbytes, make sure old CUDA 10.0 versions are uninstalled and removed from $LD_LIBRARY_PATH variables.')
-            return
-
-
-        has_cublaslt = is_cublasLt_compatible(self.cc)
-        if not has_cublaslt:
-            make_cmd += '_nomatmul'
-
-        self.add_log_entry('CUDA SETUP: Something unexpected happened. Please compile from source:')
-        self.add_log_entry('git clone https://github.com/TimDettmers/bitsandbytes.git')
-        self.add_log_entry('cd bitsandbytes')
-        self.add_log_entry(make_cmd)
-        self.add_log_entry('python setup.py install')
-
-    def initialize(self):
-        if not getattr(self, 'initialized', False):
-            self.has_printed = False
-            self.lib = None
-            self.initialized = False
-            self.error = False
-
-    def manual_override(self):
-        if not torch.cuda.is_available():
-            return
-        override_value = os.environ.get('BNB_CUDA_VERSION')
-        if not override_value:
-            return
-
-        binary_name_stem, _, binary_name_ext = self.binary_name.rpartition(".")
-        # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda118`;
-        # let's remove any trailing numbers:
-        binary_name_stem = binary_name_stem.rstrip("0123456789")
-        # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda`;
-        # let's tack the new version number and the original extension back on.
-        self.binary_name = f"{binary_name_stem}{override_value}.{binary_name_ext}"
-
-        warn(
-            f'\n\n{"=" * 80}\n'
-            'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
-            'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
-            'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
-            'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
-            'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
-            f'Loading: {self.binary_name}'
-            f'\n{"=" * 80}\n\n'
-        )
-
-    def run_cuda_setup(self):
-        self.initialized = True
-        self.cuda_setup_log = []
-
-        binary_name, cudart_path, cc, cuda_version_string = evaluate_cuda_setup()
-        self.cudart_path = cudart_path
-        self.cuda_available = torch.cuda.is_available()
-        self.cc = cc
-        self.cuda_version_string = cuda_version_string
-        self.binary_name = binary_name
-        self.manual_override()
-
-        package_dir = Path(__file__).parent.parent.parent
-        binary_path = package_dir / self.binary_name
-
-        try:
-            if not binary_path.exists():
-                self.add_log_entry(f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?")
-                legacy_binary_name = f"libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}"
-                self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
-                binary_path = package_dir / legacy_binary_name
-                if not binary_path.exists() or torch.cuda.is_available():
-                    self.add_log_entry('')
-                    self.add_log_entry('='*48 + 'ERROR' + '='*37)
-                    self.add_log_entry('CUDA SETUP: CUDA detection failed! Possible reasons:')
-                    self.add_log_entry('1. You need to manually override the PyTorch CUDA version. Please see: '
-                             '"https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md')
-                    self.add_log_entry('2. CUDA driver not installed')
-                    self.add_log_entry('3. CUDA not installed')
-                    self.add_log_entry('4. You have multiple conflicting CUDA libraries')
-                    self.add_log_entry('5. Required library not pre-compiled for this bitsandbytes release!')
-                    self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.')
-                    self.add_log_entry('CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via `conda list | grep cuda`.')
-                    self.add_log_entry('='*80)
-                    self.add_log_entry('')
-                    self.generate_instructions()
-                    raise Exception('CUDA SETUP: Setup Failed!')
-                self.lib = ct.cdll.LoadLibrary(str(binary_path))
-            else:
-                self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path!s}...")
-                self.lib = ct.cdll.LoadLibrary(str(binary_path))
-        except Exception as ex:
-            self.add_log_entry(str(ex))
-
-    def add_log_entry(self, msg, is_warning=False):
-        self.cuda_setup_log.append((msg, is_warning))
-
-    def print_log_stack(self):
-        for msg, is_warning in self.cuda_setup_log:
-            if is_warning:
-                warn(msg)
-            else:
-                print(msg)
-
-    @classmethod
-    def get_instance(cls):
-        if cls._instance is None:
-            cls._instance = cls.__new__(cls)
-            cls._instance.initialize()
-        return cls._instance
-
-
-def is_cublasLt_compatible(cc):
-    has_cublaslt = False
-    if cc is not None:
-        cc_major, cc_minor = cc.split('.')
-        if int(cc_major) < 7 or (int(cc_major) == 7 and int(cc_minor) < 5):
-            CUDASetup.get_instance().add_log_entry("WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU! \
-                    If you run into issues with 8-bit matmul, you can try 4-bit quantization: https://huggingface.co/blog/4bit-transformers-bitsandbytes", is_warning=True)
-        else:
-            has_cublaslt = True
-    return has_cublaslt
-
-def extract_candidate_paths(paths_list_candidate: str) -> Set[Path]:
-    return {Path(ld_path) for ld_path in paths_list_candidate.split(os.pathsep) if ld_path}
-
-
-def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
-    existent_directories: Set[Path] = set()
-    for path in candidate_paths:
-        try:
-            if path.exists():
-                existent_directories.add(path)
-        except PermissionError:
-            # Handle the PermissionError first as it is a subtype of OSError
-            # https://docs.python.org/3/library/exceptions.html#exception-hierarchy
-            pass
-        except OSError as exc:
-            if exc.errno != errno.ENAMETOOLONG:
-                raise exc
-
-    non_existent_directories: Set[Path] = candidate_paths - existent_directories
-    if non_existent_directories:
-        CUDASetup.get_instance().add_log_entry(
-            f"The following directories listed in your path were found to be non-existent: {non_existent_directories}",
-            is_warning=False,
-        )
-
-    return existent_directories
-
-
-def get_cuda_runtime_lib_paths(candidate_paths: Set[Path]) -> Set[Path]:
-    paths = set()
-    for libname in CUDA_RUNTIME_LIBS:
-        for path in candidate_paths:
-            try:
-                if (path / libname).is_file():
-                    paths.add(path / libname)
-            except PermissionError:
-                pass
-    return paths
-
-
-def resolve_paths_list(paths_list_candidate: str) -> Set[Path]:
-    """
-    Searches a given environmental var for the CUDA runtime library,
-    i.e. `libcudart.so`.
-    """
-    return remove_non_existent_dirs(extract_candidate_paths(paths_list_candidate))
-
-
-def find_cuda_lib_in(paths_list_candidate: str) -> Set[Path]:
-    return get_cuda_runtime_lib_paths(
-        resolve_paths_list(paths_list_candidate)
-    )
-
-
-def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
-    if len(results_paths) > 1:
-        warning_msg = (
-            f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. "
-            "We select the PyTorch default libcudart.so, which is {torch.version.cuda},"
-            "but this might mismatch with the CUDA version that is needed for bitsandbytes."
-            "To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variable"
-            "For example, if you want to use the CUDA version 122"
-            "BNB_CUDA_VERSION=122 python ..."
-            "OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122"
-            "In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g."
-            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2")
-        CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True)
-
-
-def determine_cuda_runtime_lib_path() -> Union[Path, None]:
-    """
-        Searches for a cuda installations, in the following order of priority:
-            1. active conda env
-            2. LD_LIBRARY_PATH
-            3. any other env vars, while ignoring those that
-                - are known to be unrelated (see `bnb.device_setup.cuda.env_vars.to_be_ignored`)
-                - don't contain the path separator `/`
-
-        If multiple libraries are found in part 3, we optimistically try one,
-        while giving a warning message.
-    """
-    candidate_env_vars = get_potentially_lib_path_containing_env_vars()
-
-    cuda_runtime_libs = set()
-    if "CONDA_PREFIX" in candidate_env_vars:
-        conda_libs_path = Path(candidate_env_vars["CONDA_PREFIX"]) / "lib"
-
-        conda_cuda_libs = find_cuda_lib_in(str(conda_libs_path))
-        warn_in_case_of_duplicates(conda_cuda_libs)
-
-        if conda_cuda_libs:
-            cuda_runtime_libs.update(conda_cuda_libs)
-
-        CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["CONDA_PREFIX"]} did not contain '
-            f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', is_warning=True)
-
-    if "LD_LIBRARY_PATH" in candidate_env_vars:
-        lib_ld_cuda_libs = find_cuda_lib_in(candidate_env_vars["LD_LIBRARY_PATH"])
-
-        if lib_ld_cuda_libs:
-            cuda_runtime_libs.update(lib_ld_cuda_libs)
-        warn_in_case_of_duplicates(lib_ld_cuda_libs)
-
-        CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["LD_LIBRARY_PATH"]} did not contain '
-            f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', is_warning=True)
-
-    remaining_candidate_env_vars = {
-        env_var: value for env_var, value in candidate_env_vars.items()
-        if env_var not in {"CONDA_PREFIX", "LD_LIBRARY_PATH"}
-    }
-
-    cuda_runtime_libs = set()
-    for env_var, value in remaining_candidate_env_vars.items():
-        cuda_runtime_libs.update(find_cuda_lib_in(value))
-
-    if len(cuda_runtime_libs) == 0:
-        CUDASetup.get_instance().add_log_entry('CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...')
-        cuda_runtime_libs.update(find_cuda_lib_in('/usr/local/cuda/lib64'))
-
-    warn_in_case_of_duplicates(cuda_runtime_libs)
-
-    cuda_setup = CUDASetup.get_instance()
-    cuda_setup.add_log_entry(f'DEBUG: Possible options found for libcudart.so: {cuda_runtime_libs}')
-
-    return next(iter(cuda_runtime_libs)) if cuda_runtime_libs else None
-
-
-# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION
-def get_cuda_version():
-    major, minor = map(int, torch.version.cuda.split("."))
-
-    if major < 11:
-        CUDASetup.get_instance().add_log_entry('CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!')
-
-    return f'{major}{minor}'
-
-def get_compute_capabilities():
-    ccs = []
-    for i in range(torch.cuda.device_count()):
-        cc_major, cc_minor = torch.cuda.get_device_capability(torch.cuda.device(i))
-        ccs.append(f"{cc_major}.{cc_minor}")
-
-    ccs.sort(key=lambda v: tuple(map(int, str(v).split("."))))
-
-    return ccs
-
-
-def evaluate_cuda_setup():
-    cuda_setup = CUDASetup.get_instance()
-    if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
-        cuda_setup.add_log_entry('')
-        cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35)
-        cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'),
-              ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
-        cuda_setup.add_log_entry('='*80)
-
-    if not torch.cuda.is_available():
-        return f'libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}', None, None, None
-
-    cudart_path = determine_cuda_runtime_lib_path()
-    cc = get_compute_capabilities()[-1]  # we take the highest capability
-    cuda_version_string = get_cuda_version()
-
-    cuda_setup.add_log_entry(f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}.")
-    cuda_setup.add_log_entry(
-        "CUDA SETUP: To manually override the PyTorch CUDA version please see:"
-        "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md"
-    )
-
-
-    # 7.5 is the minimum CC vor cublaslt
-    has_cublaslt = is_cublasLt_compatible(cc)
-
-    # TODO:
-    # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible)
-    # (2) Multiple CUDA versions installed
-
-    # we use ls -l instead of nvcc to determine the cuda version
-    # since most installations will have the libcudart.so installed, but not the compiler
-
-    binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
-    if not has_cublaslt:
-        # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
-        binary_name += "_nocublaslt"
-
-    binary_name = f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}"
-
-    return binary_name, cudart_path, cc, cuda_version_string

From 6f08879a2bf2d094f75013a3d0e791c662604240 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Thu, 28 Mar 2024 11:25:13 +0800
Subject: [PATCH 095/234] clean

---
 bitsandbytes/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index ec25eb0bc..0229e59e2 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -1,7 +1,7 @@
 import json
 import shlex
 import subprocess
-from typing import Any, Dict, Tuple
+from typing import Tuple
 
 import torch
 

From a9e454885a6e6999bfe6960bd40f7abf843da6bd Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Wed, 27 Mar 2024 20:28:30 -0700
Subject: [PATCH 096/234] fix utils

---
 bitsandbytes/functional.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 8a0c7dbae..a80e56011 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -6,14 +6,14 @@
 from functools import reduce  # Required in Python 3
 import itertools
 import operator
-from typing import Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
 
 import numpy as np
 import torch
 from torch import Tensor
 
 from bitsandbytes.backends import backends, ensure_backend_is_available
-
+from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
 from .cextension import lib
 
 

From 84f67d260ca7bc59113419dcdebc6ea729f29129 Mon Sep 17 00:00:00 2001
From: jianan-gu <jianan.gu@intel.com>
Date: Wed, 27 Mar 2024 20:35:28 -0700
Subject: [PATCH 097/234] link QuantState in F.

---
 bitsandbytes/functional.py | 182 +------------------------------------
 bitsandbytes/utils.py      | 177 +++++++++++++++++++++++++++++++++++-
 2 files changed, 180 insertions(+), 179 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index a80e56011..38459981b 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -6,14 +6,14 @@
 from functools import reduce  # Required in Python 3
 import itertools
 import operator
-from typing import Any, Dict, Optional, Tuple
+from typing import Optional, Tuple
 
 import numpy as np
 import torch
 from torch import Tensor
 
 from bitsandbytes.backends import backends, ensure_backend_is_available
-from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
+from bitsandbytes.utils import QuantState
 from .cextension import lib
 
 
@@ -617,182 +617,8 @@ def estimate_quantiles(
 
     return out
 
-
-class QuantState:
-    """container for quantization state components to work with Params4bit and similar classes"""
-
-    valid_quant_types = ("fp4", "nf4")
-    valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
-    valid_qs_keys = [
-        "absmax",
-        "quant_map",
-        "nested_absmax",
-        "nested_quant_map",
-        "quant_state",
-        "quant_type",
-        "blocksize",
-        "dtype",
-        "shape",
-        "nested_blocksize",
-        "nested_dtype",
-        "nested_offset",
-    ]
-
-    def __init__(
-        self,
-        absmax,
-        shape=None,
-        code=None,
-        blocksize=None,
-        quant_type=None,
-        dtype=None,
-        offset=None,
-        state2=None,
-    ):
-        self.absmax = absmax
-        self.shape = shape
-        self.code = code
-        self.dtype = dtype
-        self.blocksize = blocksize
-        self.quant_type = quant_type
-        self.offset = offset
-        self.state2 = state2
-        self.nested = state2 is not None
-
-    def __get_item__(self, idx):
-        """
-        ensures compatibility with older quant state scheme with nested lists.
-        assumes the following layout:
-        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
-        state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
-        """
-        if self.nested:
-            list_repr = [
-                self.absmax,
-                self.shape,
-                self.dtype,
-                self.blocksize,
-                [self.offset, self.state2],
-                self.quant_type,
-            ]
-        else:
-            list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, None, self.quant_type]
-        return list_repr[idx]
-
-    @classmethod
-    def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> "QuantState":
-        """
-        unpacks components of state_dict into QuantState
-        where necessary, convert into strings, torch.dtype, ints, etc.
-
-        qs_dict: based on state_dict, with only relevant keys, striped of prefixes.
-
-        item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.
-        """
-
-        # unpacking tensor with non-tensor components
-        qs_key = [k for k, v in qs_dict.items() if "quant_state" in k and isinstance(v, torch.Tensor)]
-        if not len(qs_key) and "quant_type" not in qs_dict:
-            raise ValueError("Expected packed or unpacked quant_state items, found neither")
-        elif len(qs_key) != 1 or qs_key[0].split(".")[-1] not in cls.valid_qs_type_keys:
-            raise ValueError(
-                f"There should be exactly one `quant_state` item with ending from {cls.valid_qs_type_keys}.\nDetected {qs_key}.",
-            )
-
-        # unpacking minor and non-tensor quant state items if necessary
-        if len(qs_key) == 1:
-            first_qs_key = qs_key[0]
-            qs_dict.update(unpack_tensor_to_dict(qs_dict.pop(first_qs_key)))
-
-        qs_dict = {k.split(".")[-1]: v for k, v in qs_dict.items()}  # strip prefixes
-        assert set(qs_dict.keys()).issubset(cls.valid_qs_keys)
-
-        if "nested_absmax" in qs_dict:
-            offset = torch.tensor(float(qs_dict["nested_offset"])).to(device)
-            state2 = cls(
-                absmax=qs_dict["nested_absmax"].to(device),
-                blocksize=qs_dict["nested_blocksize"],
-                code=qs_dict["nested_quant_map"].to(device),
-                dtype=getattr(torch, qs_dict["nested_dtype"]),
-            )
-        else:
-            offset, state2 = None, None
-
-        quant_state = cls(
-            quant_type=qs_dict["quant_type"],
-            absmax=qs_dict["absmax"].to(device),
-            blocksize=qs_dict["blocksize"],
-            code=qs_dict["quant_map"].to(device),
-            dtype=getattr(torch, qs_dict["dtype"]),
-            shape=torch.Size(qs_dict["shape"]) if qs_dict["shape"] is not None else None,
-            offset=offset,
-            state2=state2,
-        )
-        return quant_state
-
-    def as_dict(self, packed=False):
-        """
-        returns dict of tensors and strings to use in serialization via _save_to_state_dict()
-        param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving
-        """
-        qs_dict = {
-            "quant_type": self.quant_type,
-            "absmax": self.absmax,
-            "blocksize": self.blocksize,
-            "quant_map": self.code,
-            "dtype": str(self.dtype).strip("torch."),
-            "shape": tuple(self.shape),
-        }
-        if self.nested:
-            qs_dict.update(
-                {
-                    "nested_absmax": self.state2.absmax,
-                    "nested_blocksize": self.state2.blocksize,
-                    "nested_quant_map": self.state2.code.clone(),  # un-shared to avoid restoring it after shared tensors are removed by safetensors
-                    "nested_dtype": str(self.state2.dtype).strip("torch."),
-                    "nested_offset": self.offset.item(),
-                },
-            )
-        if not packed:
-            return qs_dict
-
-        # packed format allows serialization of non-tensor components, critical for saving in safetensors format
-        qs_packed_dict = {k: v for k, v in qs_dict.items() if isinstance(v, torch.Tensor)}
-        non_tensor_dict = {k: v for k, v in qs_dict.items() if not isinstance(v, torch.Tensor)}
-        qs_packed_dict["quant_state." + "bitsandbytes__" + self.quant_type] = pack_dict_to_tensor(non_tensor_dict)
-        return qs_packed_dict
-
-    def to(self, device):
-        # make sure the quantization state is on the right device
-        self.absmax = self.absmax.to(device)
-        if self.nested:
-            self.offset = self.offset.to(device)
-            self.state2.absmax = self.state2.absmax.to(device)
-            self.state2.code = self.state2.code.to(device)
-
-    def __eq__(self, other):
-        if not isinstance(other, QuantState):
-            return False
-
-        return (
-            torch.allclose(self.absmax, other.absmax, atol=1e-6)
-            and self.shape == other.shape
-            and torch.allclose(self.code, other.code, atol=1e-6)
-            and self.dtype == other.dtype
-            and self.blocksize == other.blocksize
-            and self.quant_type == other.quant_type
-            and (
-                self.offset == other.offset
-                if self.offset is not None and other.offset is not None
-                else self.offset is other.offset
-            )
-            and (
-                self.state2 == other.state2
-                if self.state2 is not None and other.state2 is not None
-                else self.state2 is other.state2
-            )
-        )
-
+# maintain the compatibility as F.QuantState
+QuantState = QuantState
 
 def quantize_blockwise(
     A: Tensor,
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 0229e59e2..29a5cfea3 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -1,7 +1,7 @@
 import json
 import shlex
 import subprocess
-from typing import Tuple
+from typing import Any, Dict, Tuple
 
 import torch
 
@@ -198,3 +198,178 @@ def unpack_tensor_to_dict(tensor_data):
     unpacked_dict = json.loads(json_str)
 
     return unpacked_dict
+
+class QuantState:
+    """container for quantization state components to work with Params4bit and similar classes"""
+
+    valid_quant_types = ("fp4", "nf4")
+    valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
+    valid_qs_keys = [
+        "absmax",
+        "quant_map",
+        "nested_absmax",
+        "nested_quant_map",
+        "quant_state",
+        "quant_type",
+        "blocksize",
+        "dtype",
+        "shape",
+        "nested_blocksize",
+        "nested_dtype",
+        "nested_offset",
+    ]
+
+    def __init__(
+        self,
+        absmax,
+        shape=None,
+        code=None,
+        blocksize=None,
+        quant_type=None,
+        dtype=None,
+        offset=None,
+        state2=None,
+    ):
+        self.absmax = absmax
+        self.shape = shape
+        self.code = code
+        self.dtype = dtype
+        self.blocksize = blocksize
+        self.quant_type = quant_type
+        self.offset = offset
+        self.state2 = state2
+        self.nested = state2 is not None
+
+    def __get_item__(self, idx):
+        """
+        ensures compatibility with older quant state scheme with nested lists.
+        assumes the following layout:
+        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
+        state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
+        """
+        if self.nested:
+            list_repr = [
+                self.absmax,
+                self.shape,
+                self.dtype,
+                self.blocksize,
+                [self.offset, self.state2],
+                self.quant_type,
+            ]
+        else:
+            list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, None, self.quant_type]
+        return list_repr[idx]
+
+    @classmethod
+    def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> "QuantState":
+        """
+        unpacks components of state_dict into QuantState
+        where necessary, convert into strings, torch.dtype, ints, etc.
+
+        qs_dict: based on state_dict, with only relevant keys, striped of prefixes.
+
+        item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.
+        """
+
+        # unpacking tensor with non-tensor components
+        qs_key = [k for k, v in qs_dict.items() if "quant_state" in k and isinstance(v, torch.Tensor)]
+        if not len(qs_key) and "quant_type" not in qs_dict:
+            raise ValueError("Expected packed or unpacked quant_state items, found neither")
+        elif len(qs_key) != 1 or qs_key[0].split(".")[-1] not in cls.valid_qs_type_keys:
+            raise ValueError(
+                f"There should be exactly one `quant_state` item with ending from {cls.valid_qs_type_keys}.\nDetected {qs_key}.",
+            )
+
+        # unpacking minor and non-tensor quant state items if necessary
+        if len(qs_key) == 1:
+            first_qs_key = qs_key[0]
+            qs_dict.update(unpack_tensor_to_dict(qs_dict.pop(first_qs_key)))
+
+        qs_dict = {k.split(".")[-1]: v for k, v in qs_dict.items()}  # strip prefixes
+        assert set(qs_dict.keys()).issubset(cls.valid_qs_keys)
+
+        if "nested_absmax" in qs_dict:
+            offset = torch.tensor(float(qs_dict["nested_offset"])).to(device)
+            state2 = cls(
+                absmax=qs_dict["nested_absmax"].to(device),
+                blocksize=qs_dict["nested_blocksize"],
+                code=qs_dict["nested_quant_map"].to(device),
+                dtype=getattr(torch, qs_dict["nested_dtype"]),
+            )
+        else:
+            offset, state2 = None, None
+
+        quant_state = cls(
+            quant_type=qs_dict["quant_type"],
+            absmax=qs_dict["absmax"].to(device),
+            blocksize=qs_dict["blocksize"],
+            code=qs_dict["quant_map"].to(device),
+            dtype=getattr(torch, qs_dict["dtype"]),
+            shape=torch.Size(qs_dict["shape"]) if qs_dict["shape"] is not None else None,
+            offset=offset,
+            state2=state2,
+        )
+        return quant_state
+
+    def as_dict(self, packed=False):
+        """
+        returns dict of tensors and strings to use in serialization via _save_to_state_dict()
+        param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving
+        """
+        qs_dict = {
+            "quant_type": self.quant_type,
+            "absmax": self.absmax,
+            "blocksize": self.blocksize,
+            "quant_map": self.code,
+            "dtype": str(self.dtype).strip("torch."),
+            "shape": tuple(self.shape),
+        }
+        if self.nested:
+            qs_dict.update(
+                {
+                    "nested_absmax": self.state2.absmax,
+                    "nested_blocksize": self.state2.blocksize,
+                    "nested_quant_map": self.state2.code.clone(),  # un-shared to avoid restoring it after shared tensors are removed by safetensors
+                    "nested_dtype": str(self.state2.dtype).strip("torch."),
+                    "nested_offset": self.offset.item(),
+                },
+            )
+        if not packed:
+            return qs_dict
+
+        # packed format allows serialization of non-tensor components, critical for saving in safetensors format
+        qs_packed_dict = {k: v for k, v in qs_dict.items() if isinstance(v, torch.Tensor)}
+        non_tensor_dict = {k: v for k, v in qs_dict.items() if not isinstance(v, torch.Tensor)}
+        qs_packed_dict["quant_state." + "bitsandbytes__" + self.quant_type] = pack_dict_to_tensor(non_tensor_dict)
+        return qs_packed_dict
+
+    def to(self, device):
+        # make sure the quantization state is on the right device
+        self.absmax = self.absmax.to(device)
+        if self.nested:
+            self.offset = self.offset.to(device)
+            self.state2.absmax = self.state2.absmax.to(device)
+            self.state2.code = self.state2.code.to(device)
+
+    def __eq__(self, other):
+        if not isinstance(other, QuantState):
+            return False
+
+        return (
+            torch.allclose(self.absmax, other.absmax, atol=1e-6)
+            and self.shape == other.shape
+            and torch.allclose(self.code, other.code, atol=1e-6)
+            and self.dtype == other.dtype
+            and self.blocksize == other.blocksize
+            and self.quant_type == other.quant_type
+            and (
+                self.offset == other.offset
+                if self.offset is not None and other.offset is not None
+                else self.offset is other.offset
+            )
+            and (
+                self.state2 == other.state2
+                if self.state2 is not None and other.state2 is not None
+                else self.state2 is other.state2
+            )
+        )

From 9ff6c638ef5bb1e07e2d00c5327e018d5d05e2f1 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Wed, 3 Apr 2024 09:58:31 +0000
Subject: [PATCH 098/234] pre-commit run --all-files

---
 bitsandbytes/__init__.py          |   3 +-
 bitsandbytes/backends/__init__.py |   2 +
 bitsandbytes/backends/cuda.py     | 245 ++++++++++++++++++------------
 bitsandbytes/functional.py        |  50 +++---
 bitsandbytes/utils.py             |   1 +
 install_cuda.py                   |   8 +-
 6 files changed, 187 insertions(+), 122 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index c3d6a19e7..019a4f6ab 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -4,7 +4,6 @@
 # LICENSE file in the root directory of this source tree.
 
 from . import research, utils
-from .cextension import lib
 from .autograd._functions import (
     MatmulLtState,
     bmm_cublas,
@@ -13,12 +12,14 @@
     matmul_cublas,
     mm_cublas,
 )
+from .cextension import lib
 from .nn import modules
 
 if lib and lib.compiled_with_cuda:
     from .backends import register_backend
     from .backends.cuda import CUDABackend
     from .optim import adam
+
     register_backend("cuda", CUDABackend())
 __pdoc__ = {
     "libbitsandbytes": False,
diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
index d35021b1e..30f08073a 100644
--- a/bitsandbytes/backends/__init__.py
+++ b/bitsandbytes/backends/__init__.py
@@ -4,9 +4,11 @@
 
 backends: Dict[str, Backend] = {}
 
+
 def register_backend(backend_name: str, backend_instance: Backend):
     backends[backend_name.lower()] = backend_instance
 
+
 def ensure_backend_is_available(device_type: str):
     """Check if a backend is available for the given device type."""
     if device_type.lower() not in backends:
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 4fa1946e9..c76bcaebd 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -25,9 +25,7 @@
 
 
 class CUDABackend(Backend):
-    def double_quant(
-        self, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
-    ):
+    def double_quant(self, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
         device = A.device
         assert A.dtype == torch.half
         assert device.type == "cuda"
@@ -40,9 +38,7 @@ def double_quant(
             rows = A.shape[0]
 
         if row_stats is None or col_stats is None:
-            row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(
-                A, threshold=threshold
-            )
+            row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(A, threshold=threshold)
 
         if out_col is None:
             out_col = torch.zeros(A.shape, device=device, dtype=torch.int8)
@@ -60,9 +56,7 @@ def double_quant(
         if threshold > 0.0:
             nnz = nnz_row_ptr[-1].item()
             if nnz > 0:
-                coo_tensor = coo_zeros(
-                    A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device
-                )
+                coo_tensor = coo_zeros(A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device)
                 ptrRowIdx = get_ptr(coo_tensor.rowidx)
                 ptrColIdx = get_ptr(coo_tensor.colidx)
                 ptrVal = get_ptr(coo_tensor.values)
@@ -120,7 +114,7 @@ def double_quant(
 
         return out_row, out_col, row_stats, col_stats, coo_tensor
 
-    def transform(self, A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+    def transform(self, A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None):
         prev_device = pre_call(A.device)
         if state is None:
             state = (A.shape, from_order)
@@ -130,7 +124,7 @@ def transform(self, A, to_order, from_order='row', out=None, transpose=False, st
         if out is None:
             out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
         else:
-            new_state = (state[0], to_order) # (shape, order)
+            new_state = (state[0], to_order)  # (shape, order)
 
         shape = state[0]
         if len(shape) == 2:
@@ -141,7 +135,7 @@ def transform(self, A, to_order, from_order='row', out=None, transpose=False, st
             dim2 = ct.c_int32(shape[2])
 
         is_on_gpu([A, out])
-        if to_order == 'col32':
+        if to_order == "col32":
             if transpose:
                 lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
             else:
@@ -166,7 +160,7 @@ def transform(self, A, to_order, from_order='row', out=None, transpose=False, st
                 lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
 
         else:
-            raise NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}')
+            raise NotImplementedError(f"Transform function not implemented: From {from_order} to {to_order}")
 
         post_call(prev_device)
 
@@ -178,14 +172,14 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         dimsA = len(shapeA)
         dimsB = len(shapeB)
 
-        assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
+        assert dimsB == 2, "Only two dimensional matrices are supported for argument B"
         if dimsA == 2:
             m = shapeA[0]
         elif dimsA == 3:
             m = shapeA[0] * shapeA[1]
 
         rows = n = shapeB[0]
-        assert prod(list(shapeA)) > 0, f'Input tensor dimensions need to be > 0: {shapeA}'
+        assert prod(list(shapeA)) > 0, f"Input tensor dimensions need to be > 0: {shapeA}"
 
         # if the tensor is empty, return a transformed empty tensor with the right dimensions
         if shapeA[0] == 0 and dimsA == 2:
@@ -194,13 +188,9 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
             return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
 
         if dimsA == 2 and out is None:
-            out, Sout = get_transform_buffer(
-                (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
-            )
+            out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row")
         elif dimsA == 3 and out is None:
-            out, Sout = get_transform_buffer(
-                (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
-            )
+            out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row")
 
         assert dimsB != 3, "len(B.shape)==3 not supported"
         assert A.device.type == "cuda"
@@ -244,50 +234,37 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         ptrRowScale = get_ptr(None)
         is_on_gpu([A, B, out])
 
-        if formatB == 'col_turing':
+        if formatB == "col_turing":
             if dtype == torch.int32:
-                has_error = lib.cigemmlt_turing_32(
-                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-                )
+                has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
             else:
-                has_error = lib.cigemmlt_turing_8(
-                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-                )
+                has_error = lib.cigemmlt_turing_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
 
         elif formatB == "col_ampere":
             if dtype == torch.int32:
-                has_error = lib.cigemmlt_ampere_32(
-                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-                )
+                has_error = lib.cigemmlt_ampere_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
             else:
-                has_error = lib.cigemmlt_ampere_8(
-                    ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-                )
+                has_error = lib.cigemmlt_ampere_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
 
         if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
             raise NotImplementedError("igemmlt not available (probably built with NO_CUBLASLT)")
 
         if has_error:
-            print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}')
-            raise Exception('cublasLt ran into an error!')
+            print(
+                f"A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}"
+            )
+            raise Exception("cublasLt ran into an error!")
 
         torch.cuda.set_device(prev_device)
 
         return out, Sout
 
     def mm_dequant(
-        self,
-        A,
-        quant_state,
-        row_stats,
-        col_stats,
-        out=None,
-        new_row_stats=None,
-        new_col_stats=None,
-        bias=None
+        self, A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None
     ):
         assert A.dtype == torch.int32
-        if bias is not None: assert bias.dtype == torch.float16
+        if bias is not None:
+            assert bias.dtype == torch.float16
         out_shape = quant_state[0]
         if len(out_shape) == 3:
             out_shape = (out_shape[0] * out_shape[1], out_shape[2])
@@ -295,19 +272,11 @@ def mm_dequant(
         if out is None:
             out = torch.empty(out_shape, dtype=torch.float16, device=A.device)
         if new_row_stats is None:
-            new_row_stats = torch.empty(
-                out_shape[0], dtype=torch.float32, device=A.device
-            )
+            new_row_stats = torch.empty(out_shape[0], dtype=torch.float32, device=A.device)
         if new_col_stats is None:
-            new_col_stats = torch.empty(
-                out_shape[1], dtype=torch.float32, device=A.device
-            )
-        assert (
-            new_row_stats.shape[0] == row_stats.shape[0]
-        ), f"{new_row_stats.shape} vs {row_stats.shape}"
-        assert (
-            new_col_stats.shape[0] == col_stats.shape[0]
-        ), f"{new_col_stats.shape} vs {col_stats.shape}"
+            new_col_stats = torch.empty(out_shape[1], dtype=torch.float32, device=A.device)
+        assert new_row_stats.shape[0] == row_stats.shape[0], f"{new_row_stats.shape} vs {row_stats.shape}"
+        assert new_col_stats.shape[0] == col_stats.shape[0], f"{new_col_stats.shape} vs {col_stats.shape}"
 
         prev_device = pre_call(A.device)
         ptrA = get_ptr(A)
@@ -321,7 +290,9 @@ def mm_dequant(
         numCols = ct.c_int32(out_shape[1])
 
         is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias])
-        lib.cdequant_mm_int32_fp16(ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols)
+        lib.cdequant_mm_int32_fp16(
+            ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols
+        )
         post_call(prev_device)
 
         return out
@@ -332,9 +303,7 @@ def extract_outliers(self, A, SA, idx):
         assert formatA in ["col_turing", "col_ampere"]
         assert A.device.type == "cuda"
 
-        out = torch.zeros(
-            (shapeA[0], idx.numel()), dtype=torch.int8, device=A.device
-        )
+        out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device)
 
         idx_size = ct.c_int32(idx.numel())
         rows = ct.c_int32(shapeA[0])
@@ -345,7 +314,7 @@ def extract_outliers(self, A, SA, idx):
 
         prev_device = pre_call(A.device)
 
-        if formatA == 'col_turing':
+        if formatA == "col_turing":
             lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
         elif formatA == "col_ampere":
             lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
@@ -361,13 +330,13 @@ def quantize_4bit(
         out: Optional[torch.Tensor] = None,
         blocksize=64,
         compress_statistics=False,
-        quant_type='fp4',
+        quant_type="fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
-        if A.device.type != 'cuda':
-            raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
-        if quant_type not in ['fp4', 'nf4']:
-            raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
+        if A.device.type != "cuda":
+            raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}")
+        if quant_type not in ["fp4", "nf4"]:
+            raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
 
         n = A.numel()
         input_shape = A.shape
@@ -379,7 +348,7 @@ def quantize_4bit(
 
         if out is None:
             mod = dtype2bytes[quant_storage] * 2
-            out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
+            out = torch.zeros(((n + 1) // mod, 1), dtype=quant_storage, device=A.device)
 
         assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
 
@@ -387,22 +356,34 @@ def quantize_4bit(
         is_on_gpu([A, out, absmax])
 
         if A.dtype == torch.float32:
-            if quant_type == 'fp4':
-                lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+            if quant_type == "fp4":
+                lib.cquantize_blockwise_fp32_fp4(
+                    get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+                )
             else:
-                lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+                lib.cquantize_blockwise_fp32_nf4(
+                    get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+                )
 
         elif A.dtype == torch.float16:
-            if quant_type == 'fp4':
-                lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+            if quant_type == "fp4":
+                lib.cquantize_blockwise_fp16_fp4(
+                    get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+                )
             else:
-                lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+                lib.cquantize_blockwise_fp16_nf4(
+                    get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+                )
 
         elif A.dtype == torch.bfloat16:
-            if quant_type == 'fp4':
-                lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+            if quant_type == "fp4":
+                lib.cquantize_blockwise_bf16_fp4(
+                    get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+                )
             else:
-                lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+                lib.cquantize_blockwise_bf16_nf4(
+                    get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+                )
 
         else:
             raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
@@ -416,31 +397,55 @@ def quantize_4bit(
             absmax -= offset
             qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
             del absmax
-            state = QuantState(absmax=qabsmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, offset=offset, state2=state2)
+            state = QuantState(
+                absmax=qabsmax,
+                shape=input_shape,
+                dtype=A.dtype,
+                blocksize=blocksize,
+                code=code,
+                quant_type=quant_type,
+                offset=offset,
+                state2=state2,
+            )
 
         else:
-            state = QuantState(absmax=absmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type)
+            state = QuantState(
+                absmax=absmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type
+            )
 
         return out, state
 
-    def dequantize_4bit(self, A: torch.Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, quant_type='fp4') -> torch.Tensor:
+    def dequantize_4bit(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 64,
+        quant_type="fp4",
+    ) -> torch.Tensor:
         if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
-            raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
+            raise ValueError(
+                f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]"
+            )
 
-        if quant_type not in ['fp4', 'nf4']:
-            raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
+        if quant_type not in ["fp4", "nf4"]:
+            raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
 
         if quant_state is None:
             assert absmax is not None and out is not None
 
-            quant_state = QuantState(absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type)
+            quant_state = QuantState(
+                absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type
+            )
         else:
             absmax = quant_state.absmax
 
         if quant_state.nested:
             absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
             absmax += quant_state.offset
-            if absmax.dtype != torch.float32: absmax = absmax.float()
+            if absmax.dtype != torch.float32:
+                absmax = absmax.float()
 
         if out is None:
             out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
@@ -451,29 +456,73 @@ def dequantize_4bit(self, A: torch.Tensor, quant_state: Optional[QuantState] = N
         is_on_gpu([A, absmax, out])
 
         if out.dtype == torch.float32:
-            if quant_state.quant_type == 'fp4':
-                lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+            if quant_state.quant_type == "fp4":
+                lib.cdequantize_blockwise_fp32_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
             else:
-                lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+                lib.cdequantize_blockwise_fp32_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
 
         elif out.dtype == torch.float16:
-            if quant_state.quant_type == 'fp4':
-                lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+            if quant_state.quant_type == "fp4":
+                lib.cdequantize_blockwise_fp16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
             else:
-                lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+                lib.cdequantize_blockwise_fp16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
 
         elif out.dtype == torch.bfloat16:
-            if quant_state.quant_type == 'fp4':
-                lib.cdequantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+            if quant_state.quant_type == "fp4":
+                lib.cdequantize_blockwise_bf16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
             else:
-                lib.cdequantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
+                lib.cdequantize_blockwise_bf16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                )
 
         else:
             raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
 
         post_call(A.device)
 
-        is_transposed = (True if A.shape[0] == 1 else False)
+        is_transposed = True if A.shape[0] == 1 else False
 
-        if is_transposed: return out.t()
-        else: return out
+        if is_transposed:
+            return out.t()
+        else:
+            return out
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 38459981b..6bb02944d 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -14,6 +14,7 @@
 
 from bitsandbytes.backends import backends, ensure_backend_is_available
 from bitsandbytes.utils import QuantState
+
 from .cextension import lib
 
 
@@ -617,9 +618,11 @@ def estimate_quantiles(
 
     return out
 
+
 # maintain the compatibility as F.QuantState
 QuantState = QuantState
 
+
 def quantize_blockwise(
     A: Tensor,
     code: Optional[torch.Tensor] = None,
@@ -977,7 +980,15 @@ def quantize_4bit(
         The quantization state to undo the quantization.
     """
     ensure_backend_is_available(A.device.type)
-    return backends[A.device.type].quantize_4bit(A, absmax=absmax, out=out, blocksize=blocksize, compress_statistics=compress_statistics, quant_type=quant_type, quant_storage=quant_storage)
+    return backends[A.device.type].quantize_4bit(
+        A,
+        absmax=absmax,
+        out=out,
+        blocksize=blocksize,
+        compress_statistics=compress_statistics,
+        quant_type=quant_type,
+        quant_storage=quant_storage,
+    )
 
 
 def dequantize_fp4(
@@ -1035,7 +1046,9 @@ def dequantize_4bit(
         Dequantized tensor.
     """
     ensure_backend_is_available(A.device.type)
-    return backends[A.device.type].dequantize_4bit(A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type)
+    return backends[A.device.type].dequantize_4bit(
+        A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type
+    )
 
 
 def quantize(
@@ -1876,18 +1889,18 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     return backends[A.device.type].igemmlt(A, B, SA, SB, out=out, Sout=Sout, dtype=dtype)
 
 
-def mm_dequant(
-    A,
-    quant_state,
-    row_stats,
-    col_stats,
-    out=None,
-    new_row_stats=None,
-    new_col_stats=None,
-    bias=None
-):
+def mm_dequant(A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None):
     ensure_backend_is_available(A.device.type)
-    return backends[A.device.type].mm_dequant(A, quant_state, row_stats, col_stats, out=out, new_row_stats=new_row_stats, new_col_stats=new_col_stats, bias=bias)
+    return backends[A.device.type].mm_dequant(
+        A,
+        quant_state,
+        row_stats,
+        col_stats,
+        out=out,
+        new_row_stats=new_row_stats,
+        new_col_stats=new_col_stats,
+        bias=bias,
+    )
 
 
 def get_colrow_absmax(A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0):
@@ -2009,12 +2022,16 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
 
 def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
     ensure_backend_is_available(A.device.type)
-    return backends[A.device.type].double_quant(A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold)
+    return backends[A.device.type].double_quant(
+        A, col_stats=col_stats, row_stats=row_stats, out_col=out_col, out_row=out_row, threshold=threshold
+    )
 
 
-def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+def transform(A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None):
     ensure_backend_is_available(A.device.type)
-    return backends[A.device.type].transform(A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld)
+    return backends[A.device.type].transform(
+        A, to_order, from_order=from_order, out=out, transpose=transpose, state=state, ld=ld
+    )
 
 
 def spmm_coo(cooA, B, out=None):
@@ -2280,7 +2297,6 @@ def extract_outliers(A, SA, idx):
     return backends[A.device.type].extract_outliers(A, SA, idx)
 
 
-
 def pipeline_test(A, batch_size):
     out = torch.zeros_like(A)
     lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 29a5cfea3..92744dead 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -199,6 +199,7 @@ def unpack_tensor_to_dict(tensor_data):
 
     return unpacked_dict
 
+
 class QuantState:
     """container for quantization state components to work with Params4bit and similar classes"""
 
diff --git a/install_cuda.py b/install_cuda.py
index a5d09356d..cf7c8ee71 100644
--- a/install_cuda.py
+++ b/install_cuda.py
@@ -77,9 +77,7 @@ def main():
     download_path = "/tmp"  # default download path
 
     if len(sys.argv) < 2:
-        print(
-            "Usage: python install_cuda.py <version/all> [user/system] [download_path]"
-        )
+        print("Usage: python install_cuda.py <version/all> [user/system] [download_path]")
         sys.exit(1)
 
     version = sys.argv[1]
@@ -100,9 +98,7 @@ def main():
     elif version in cuda_versions:
         install_cuda(version, base_path, download_path)
     else:
-        print(
-            f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}"
-        )
+        print(f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}")
         sys.exit(1)
 
 

From a26722179c14bae51fb56cf66eda471341f4bf23 Mon Sep 17 00:00:00 2001
From: "U-AMD\\zhaoyili" <zhaoyili@AUS-L1-ZHAOYILI.amd.com>
Date: Mon, 8 Apr 2024 19:17:13 -0500
Subject: [PATCH 099/234] update instructions

---
 README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 882bd10de..d73713c87 100644
--- a/README.md
+++ b/README.md
@@ -23,23 +23,23 @@ You need to compile from source for ROCm.
 Compilation quickstart:
 ```bash
 # Run Docker
-docker run -it --network=host --device=/dev/kfd --device=/dev/dri --name=bnb_test --shm-size=8g --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --group-add video rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --name=bnb_test --shm-size=8g --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --group-add video rocm/pytorch:latest
 
 
 # Install Dependencies
-cd <workspace>
-git clone --recurse https://github.com/ROCmSoftwarePlatform/hipBLASLt
-cd hipBLASLt
-git checkout 4b3b34405e7e25cff404f69bfd0a832644430477
-./install.sh -idc
- 
-cd ..
-pip install einops lion_pytorch
+apt install hipblaslt
+pip install --upgrade pip
+pip install einops lion_pytorch accelerate
+pip install git+https://github.com/ROCm/transformers.git
 
 
 # Install BitsandBytes
 git clone --recurse https://github.com/ROCmSoftwarePlatform/bitsandbytes
 cd bitsandbytes
+# Checkout branch as needed
+# for general use - rocm_enabled
+# for rocm 5.7 - rocm5.7_internal_testing
+# for rocm 6.2 - rocm6.2_internal_testing
 git checkout rocm_enabled
 make hip
 python setup.py install

From ff3337148ea23642f1d2af9782854998bd132915 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Tue, 9 Apr 2024 20:37:54 +0000
Subject: [PATCH 100/234] Update README.md

---
 README.md | 140 ++++++++++--------------------------------------------
 1 file changed, 24 insertions(+), 116 deletions(-)

diff --git a/README.md b/README.md
index 81d1f40bb..9a741d22f 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,13 @@
-# bitsandbytes-rocm
+# `bitsandbytes`
 
-The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions.
-This fork is the ROCm adaptation of bitsandbytes. The repo is inspired by [agrocylo/bitsandbytes-rocm](https://github.com/agrocylo/bitsandbytes-rocm/tree/main/bitsandbytes), which is a ROCm version of bitsandbytes 0.37. This fork incorporates the majority of features from bitsandbytes 0.44, including the crucial 4 bit quantization feature.
-
-The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module.
-
-Resources:
-- [8-bit Optimizer Paper](https://arxiv.org/abs/2110.02861) --  [Video](https://www.youtube.com/watch?v=IxrlHAJtqKE) -- [Docs](https://bitsandbytes.readthedocs.io/en/latest/)
-
-- [LLM.int8() Paper](https://arxiv.org/abs/2208.07339) -- [LLM.int8() Software Blog Post](https://huggingface.co/blog/hf-bitsandbytes-integration) -- [LLM.int8() Emergent Features Blog Post](https://timdettmers.com/2022/08/17/llm-int8-and-emergent-features/)
-
-## TL;DR
-**Requirements**
-Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + ROCm >= 6.0 or CUDA > 10.0
+[![Downloads](https://static.pepy.tech/badge/bitsandbytes)](https://pepy.tech/project/bitsandbytes) [![Downloads](https://static.pepy.tech/badge/bitsandbytes/month)](https://pepy.tech/project/bitsandbytes) [![Downloads](https://static.pepy.tech/badge/bitsandbytes/week)](https://pepy.tech/project/bitsandbytes)
 
+The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and 8 & 4-bit quantization functions.
 
-**Installation**:
-
-
-You need to compile from source for ROCm. 
-
-Compilation quickstart:
-```bash
-# Run Docker
-docker run -it --network=host --device=/dev/kfd --device=/dev/dri --name=bnb_test --shm-size=8g --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --group-add video rocm/pytorch:latest
+The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module.
 
-# Install BitsandBytes
+**Installation for ROCm:**
+To install latest bitsandbytes (supported on ROCm 6.2):
 git clone --recurse https://github.com/ROCm/bitsandbytes
 cd bitsandbytes
 git checkout rocm_enabled
@@ -34,101 +16,27 @@ cmake -DCOMPUTE_BACKEND=hip -S .
 make
 pip install .
 
+For ROCm specific versions:
+Install Dependencies:
+#hipblaslt installation needed only for rocm<6.0
+apt install hipblaslt
+pip install --upgrade pip
+pip install einops lion_pytorch accelerate
+pip install git+https://github.com/ROCm/transformers.git
 
-# Run this script to check if its installed successfully 
-python check_bnb_install.py
-```
-
-**Using Int8 inference with HuggingFace Transformers**
-
-```python
-from transformers import AutoModelForCausalLM
-model = AutoModelForCausalLM.from_pretrained(
-  'decapoda-research/llama-7b-hf',
-  device_map='auto',
-  load_in_8bit=True,
-  max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB')
-```
-
-A more detailed example, can be found in [examples/int8_inference_huggingface.py](examples/int8_inference_huggingface.py).
-
-**Using 8-bit optimizer**:
-1. Comment out optimizer: ``#torch.optim.Adam(....)``
-2. Add 8-bit optimizer of your choice ``bnb.optim.Adam8bit(....)`` (arguments stay the same)
-3. Replace embedding layer if necessary: ``torch.nn.Embedding(..) -> bnb.nn.Embedding(..)``
-
-
-**Using 8-bit Inference**:
-1. Comment out torch.nn.Linear: ``#linear = torch.nn.Linear(...)``
-2. Add bnb 8-bit linear light module: ``linear = bnb.nn.Linear8bitLt(...)`` (base arguments stay the same)
-3. There are two modes:
-   - Mixed 8-bit training with 16-bit main weights. Pass the argument ``has_fp16_weights=True`` (default)
-   - Int8 inference. Pass the argument ``has_fp16_weights=False``
-4. To use the full LLM.int8() method, use the ``threshold=k`` argument. We recommend ``k=6.0``.
-```python
-# LLM.int8()
-linear = bnb.nn.Linear8bitLt(dim1, dim2, bias=True, has_fp16_weights=False, threshold=6.0)
-# inputs need to be fp16
-out = linear(x.to(torch.float16))
-```
-
-
-## Features
-- 8-bit Matrix multiplication with mixed precision decomposition
-- LLM.int8() inference
-- 8-bit Optimizers: Adam, AdamW, RMSProp, LARS, LAMB, Lion (saves 75% memory)
-- Stable Embedding Layer: Improved stability through better initialization, and normalization
-- 8-bit quantization: Quantile, Linear, and Dynamic quantization
-- Fast quantile estimation: Up to 100x faster than other algorithms
-
-## Using bitsandbytes
-
-### Using Int8 Matrix Multiplication
-
-For straight Int8 matrix multiplication with mixed precision decomposition you can use ``bnb.matmul(...)``. To enable mixed precision decomposition, use the threshold parameter:
-```python
-bnb.matmul(..., threshold=6.0)
-```
-
-For instructions how to use LLM.int8() inference layers in your own code, see the TL;DR above or for extended instruction see [this blog post](https://huggingface.co/blog/hf-bitsandbytes-integration).
-
-### Using the 8-bit Optimizers
-
-With bitsandbytes 8-bit optimizers can be used by changing a single line of code in your codebase. For NLP models we recommend also to use the StableEmbedding layers (see below) which improves results and helps with stable 8-bit optimization.  To get started with 8-bit optimizers, it is sufficient to replace your old optimizer with the 8-bit optimizer in the following way:
-```python
-import bitsandbytes as bnb
-
-# adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # comment out old optimizer
-adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # add bnb optimizer
-adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=8) # equivalent
-
-
-torch.nn.Embedding(...) ->  bnb.nn.StableEmbedding(...) # recommended for NLP models
-```
-
-Note that by default all parameter tensors with less than 4096 elements are kept at 32-bit even if you initialize those parameters with 8-bit optimizers. This is done since such small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). You can change this behavior like so:
-```python
-# parameter tensors with less than 16384 values are optimized in 32-bit
-# it is recommended to use multiplies of 4096
-adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
-```
-
-### Change Bits and other Hyperparameters for Individual Parameters
-
-If you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`. With this, we can also configure specific hyperparameters for particular layers, such as embedding layers. To do that, we need two things: (1) register the parameter while they are still on the CPU, (2) override the config with the new desired hyperparameters (anytime, anywhere). See our [guide](howto_config_override.md) for more details
-
-### Fairseq Users
-
-To use the Stable Embedding Layer, override the respective `build_embedding(...)` function of your model. Make sure to also use the `--no-scale-embedding` flag to disable scaling of the word embedding layer (nor replaced with layer norm). You can use the optimizers by replacing the optimizer in the respective file (`adam.py` etc.).
-
-## Release and Feature History
-
-For upcoming features and changes and full history see [Patch Notes](CHANGELOG.md).
+Install Bitsandbytes:
+git clone --recurse https://github.com/ROCm/bitsandbytes
+cd bitsandbytes
+# Checkout branch as needed
+# for rocm 5.7 - rocm5.7_internal_testing
+# for rocm 6.2 - rocm6.2_internal_testing
+git checkout <branch>
+make hip
+python setup.py install
 
-## Errors
+**For more details, please head to the official documentation page:**
 
-1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available)
-2. __fatbinwrap_.. [Solution](errors_and_solutions.md#fatbinwrap_)
+**[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)**
 
 ## License
 

From 702ca1ae32e022314f766a16b34888314f294570 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Tue, 9 Apr 2024 22:41:26 +0000
Subject: [PATCH 101/234] fix PEP errors

---
 Makefile                                     | 154 -----
 benchmarking/accuracy/bnb_accuracy.py        |   9 +-
 bitsandbytes/archive_functional.py           | 641 ++++++++++---------
 bitsandbytes/cuda_setup/main.py              | 296 +++++----
 bitsandbytes/functional.py                   |  28 +-
 bitsandbytes/nn/modules.py                   |   7 +-
 bitsandbytes/research/autograd/_functions.py |   2 +-
 csrc/kernels.hip                             |  94 +--
 csrc/pythonInterface.cpp                     |   4 +-
 install_cuda.py                              |   8 +-
 tests/helpers.py                             |   2 +-
 tests/test_autograd.py                       |   2 -
 tests/test_cuda_setup_evaluator.py           |   2 +-
 tests/test_functional.py                     |  18 +-
 tests/test_generation.py                     |   4 +-
 tests/test_linear8bitlt.py                   |   2 +-
 tests/test_optim.py                          |   1 -
 tests/test_triton.py                         |   3 +-
 18 files changed, 625 insertions(+), 652 deletions(-)
 delete mode 100644 Makefile

diff --git a/Makefile b/Makefile
deleted file mode 100644
index 00f5869b3..000000000
--- a/Makefile
+++ /dev/null
@@ -1,154 +0,0 @@
-MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH)))
-
-GPP:= /usr/bin/g++
-#GPP:= /sw/gcc/11.2.0/bin/g++
-ifeq ($(CUDA_HOME),)
-	CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev)
-endif
-
-ROCM_HOME := /opt/rocm
-
-ifndef CUDA_VERSION
-ifneq ($(MAKECMDGOALS),clean)
-$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU)
-CUDA_VERSION:=
-endif
-endif
-
-
-
-NVCC := $(CUDA_HOME)/bin/nvcc
-HIPCC := $(ROCM_HOME)/bin/hipcc
-
-###########################################
-
-CSRC := $(ROOT_DIR)/csrc
-BUILD_DIR:= $(ROOT_DIR)/build
-
-FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu
-FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c
-
-INCLUDE :=  -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
-LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib
-
-INCLUDE_ROCM := -I $(ROCM_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
-LIB_ROCM := -L $(ROCM_HOME)/lib -lhipblas -lhipblaslt -lhiprand -lhipsparse -L $(CONDA_PREFIX)/lib
-
-# NVIDIA NVCC compilation flags
-COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
-COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
-COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
-COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
-COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
-
-CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler
-CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler
-
-# Later versions of CUDA support the new architectures
-CC_CUDA11x := -gencode arch=compute_75,code=sm_75
-CC_CUDA11x += -gencode arch=compute_80,code=sm_80
-CC_CUDA11x += -gencode arch=compute_86,code=sm_86
-
-
-CC_cublasLt110 := -gencode arch=compute_75,code=sm_75
-CC_cublasLt110 += -gencode arch=compute_80,code=sm_80
-
-CC_cublasLt111 := -gencode arch=compute_75,code=sm_75
-CC_cublasLt111 += -gencode arch=compute_80,code=sm_80
-CC_cublasLt111 += -gencode arch=compute_86,code=sm_86
-
-CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89
-CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90
-
-
-all: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda110_nomatmul_kepler: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda11x_nomatmul_kepler: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-
-cuda110_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda11x_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda118_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER)  -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda12x_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda110: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda11x: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda118: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-hip: $(BUILD_DIR) env
-	$(HIPCC) -std=c++14 -fPIC -c $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/ops.hip -o $(BUILD_DIR)/ops.o
-	$(HIPCC) -std=c++14 -fPIC -c $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/kernels.hip -o $(BUILD_DIR)/kernels.o
-	$(GPP) -std=c++14 -D__HIP_PLATFORM_AMD__ -D__HIP_PLATFORM_HCC__ -DBUILD_HIP -shared -fPIC $(INCLUDE_ROCM) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_hip_nohipblaslt.so $(LIB_ROCM)
-
-cuda12x: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cpuonly: $(BUILD_DIR) env
-	$(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so
-
-env:
-	@echo "ENVIRONMENT"
-	@echo "============================"
-	@echo "CUDA_VERSION: $(CUDA_VERSION)"
-	@echo "============================"
-	@echo "NVCC path: $(NVCC)"
-	@echo "HIPCC path: $(HIPCC)"
-	@echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`"
-	@echo "CUDA_HOME: $(CUDA_HOME)"
-	@echo "HIP_HOME: $(HIP_HOME)"
-	@echo "CONDA_PREFIX: $(CONDA_PREFIX)"
-	@echo "PATH: $(PATH)"
-	@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
-	@echo "============================"
-
-$(BUILD_DIR):
-	mkdir -p build
-	mkdir -p dependencies
-
-$(ROOT_DIR)/dependencies/cub:
-	git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub
-	cd dependencies/cub; git checkout 1.11.0
-
-clean:
-	rm -rf build/* *.egg*
-	rm -f bitsandbytes/libbitsandbytes*.so
diff --git a/benchmarking/accuracy/bnb_accuracy.py b/benchmarking/accuracy/bnb_accuracy.py
index bd3b81db4..2860338ec 100644
--- a/benchmarking/accuracy/bnb_accuracy.py
+++ b/benchmarking/accuracy/bnb_accuracy.py
@@ -1,8 +1,6 @@
 import torch
-import bitsandbytes as bnb
-from bitsandbytes import functional as F
-
 
+from bitsandbytes import functional as F
 
 
 def debug_blocksize(block):
@@ -11,6 +9,7 @@ def debug_blocksize(block):
     dq = F.dequantize_fp4(qx, qstate)
     return torch.sum(torch.linalg.norm(x - dq, ord="fro"))
 
+
 def test_blocksize(block):
     x = torch.randn(10, 10).cuda()
     qx, qstate = F.quantize_fp4(x, blocksize=block)
@@ -20,10 +19,8 @@ def test_blocksize(block):
     print("---------------")
     print(qstate)
 
-    
-
 
 for block in [128, 256, 512, 1024, 2048]:
     print(debug_blocksize(block))
 
-#test_blocksize(2048)
+# test_blocksize(2048)
diff --git a/bitsandbytes/archive_functional.py b/bitsandbytes/archive_functional.py
index 226c9e51f..dac7430ed 100644
--- a/bitsandbytes/archive_functional.py
+++ b/bitsandbytes/archive_functional.py
@@ -3,17 +3,14 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 import ctypes as ct
+from functools import reduce  # Required in Python 3
 import itertools
 import operator
-import random
-import torch
-import itertools
-import math
-from scipy.stats import norm
-import numpy as np
-
-from functools import reduce  # Required in Python 3
 from typing import Tuple
+
+import numpy as np
+from scipy.stats import norm
+import torch
 from torch import Tensor
 
 from .cextension import COMPILED_WITH_CUDA, lib
@@ -23,12 +20,13 @@
 def prod(iterable):
     return reduce(operator.mul, iterable, 1)
 
+
 name2qmap = {}
 
 if COMPILED_WITH_CUDA:
     """C FUNCTIONS FOR OPTIMIZERS"""
     str2optimizer32bit = {}
-    str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16) #, lib.cadam32bit_grad_bf16)
+    str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16)  # , lib.cadam32bit_grad_bf16)
     str2optimizer32bit["momentum"] = (
         lib.cmomentum32bit_grad_32,
         lib.cmomentum32bit_grad_16,
@@ -37,7 +35,7 @@ def prod(iterable):
         lib.crmsprop32bit_grad_32,
         lib.crmsprop32bit_grad_16,
     )
-    str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16) #, lib.clion32bit_grad_bf16)
+    str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16)  # , lib.clion32bit_grad_bf16)
     str2optimizer32bit["adagrad"] = (
         lib.cadagrad32bit_grad_32,
         lib.cadagrad32bit_grad_16,
@@ -73,7 +71,7 @@ def prod(iterable):
     str2optimizer8bit_blockwise["adam"] = (
         lib.cadam_8bit_blockwise_grad_fp32,
         lib.cadam_8bit_blockwise_grad_fp16,
-        #lib.cadam_8bit_blockwise_grad_bf16,
+        # lib.cadam_8bit_blockwise_grad_bf16,
     )
     str2optimizer8bit_blockwise["momentum"] = (
         lib.cmomentum_8bit_blockwise_grad_fp32,
@@ -86,13 +84,14 @@ def prod(iterable):
     str2optimizer8bit_blockwise["lion"] = (
         lib.clion_8bit_blockwise_grad_fp32,
         lib.clion_8bit_blockwise_grad_fp16,
-        #lib.clion_8bit_blockwise_grad_bf16,
+        # lib.clion_8bit_blockwise_grad_bf16,
     )
     str2optimizer8bit_blockwise["adagrad"] = (
         lib.cadagrad_8bit_blockwise_grad_fp32,
         lib.cadagrad_8bit_blockwise_grad_fp16,
     )
 
+
 class GlobalPageManager:
     _instance = None
 
@@ -110,14 +109,13 @@ def get_instance(cls):
         return cls._instance
 
     def prefetch_all(self, to_cpu=False):
-        # assume the first added, will be hte
+        # assume the first added, will be the
         # ones that are used first, so swap them in last
         # in the case they are evicted again
         for t in self.paged_tensors[::-1]:
             prefetch_tensor(t, to_cpu)
 
 
-
 class CUBLAS_Context:
     _instance = None
 
@@ -150,7 +148,7 @@ def __init__(self):
         raise RuntimeError("Call get_instance() instead")
 
     def initialize(self):
-        #self.context = ct.c_void_p(lib.get_cusparse())
+        # self.context = ct.c_void_p(lib.get_cusparse())
         if torch.version.cuda:
             self.context = ct.c_void_p(lib.get_cusparse())
         elif torch.version.hip:
@@ -163,6 +161,7 @@ def get_instance(cls):
             cls._instance.initialize()
         return cls._instance
 
+
 dtype2bytes = {}
 dtype2bytes[torch.float32] = 4
 dtype2bytes[torch.float16] = 2
@@ -170,8 +169,9 @@ def get_instance(cls):
 dtype2bytes[torch.uint8] = 1
 dtype2bytes[torch.int8] = 1
 
-def get_paged(*shape, dtype=torch.float32, device=torch.device('cuda', index=0)):
-    num_bytes = dtype2bytes[dtype]*prod(shape)
+
+def get_paged(*shape, dtype=torch.float32, device=torch.device("cuda", index=0)):
+    num_bytes = dtype2bytes[dtype] * prod(shape)
     cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
     c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int))
     new_array = np.ctypeslib.as_array(c_ptr, shape=shape)
@@ -180,74 +180,86 @@ def get_paged(*shape, dtype=torch.float32, device=torch.device('cuda', index=0))
     out.page_deviceid = device.index
     return out
 
+
 def prefetch_tensor(A, to_cpu=False):
-    assert A.is_paged, 'Only paged tensors can be prefetched!'
+    assert A.is_paged, "Only paged tensors can be prefetched!"
     if to_cpu:
         deviceid = -1
     else:
         deviceid = A.page_deviceid
 
-    num_bytes = dtype2bytes[A.dtype]*A.numel()
+    num_bytes = dtype2bytes[A.dtype] * A.numel()
     lib.cprefetch(get_ptr(A), ct.c_size_t(num_bytes), ct.c_int32(deviceid))
 
+
 def elementwise_func(func_name, A, B, value, prefetch=True):
     func = None
     if A.dtype == torch.float32:
-        func = getattr(lib, f'c{func_name}_fp32', None)
+        func = getattr(lib, f"c{func_name}_fp32", None)
         cvalue = ct.c_float(value)
     elif A.dtype == torch.uint8:
-        func = getattr(lib, f'c{func_name}_uint8', None)
+        func = getattr(lib, f"c{func_name}_uint8", None)
         cvalue = ct.c_uint8(value)
 
-    if func is None: raise NotImplementedError(f'Function not implemented: {func_name}')
+    if func is None:
+        raise NotImplementedError(f"Function not implemented: {func_name}")
 
-    is_managed = getattr(A, 'is_managed', False)
+    is_managed = getattr(A, "is_managed", False)
     if is_managed and prefetch:
         prefetch_tensor(A)
-        if B is not None: prefetch_tensor(B)
+        if B is not None:
+            prefetch_tensor(B)
 
     func(get_ptr(A), get_ptr(B), cvalue, ct.c_int64(A.numel()))
     if A.is_paged or B.is_paged:
         # paged function are fully asynchronous
         # if we return from this function, we want to the tensor
         # to be in the correct state, that is the final state after the
-        # operation occured. So we synchronize.
+        # operation occurred. So we synchronize.
         torch.cuda.synchronize()
 
-def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value)
-def arange(A, device=None): elementwise_func('arange', A, None, 0)
-def _mul(A, B, device=None): elementwise_func('_mul', A, B, 0)
+
+def fill(A, value, device=None, prefetch=True):
+    elementwise_func("fill", A, None, value)
+
+
+def arange(A, device=None):
+    elementwise_func("arange", A, None, 0)
+
+
+def _mul(A, B, device=None):
+    elementwise_func("_mul", A, B, 0)
 
 
 def create_linear_map(signed=True, total_bits=8, add_zero=True):
-    sign = (-1.0 if signed else 0.0)
+    sign = -1.0 if signed else 0.0
     total_values = 2**total_bits
     if add_zero or total_bits < 8:
         # add a zero
         # since we simulate less bits by having zeros in the data type, we
         # we need to center the quantization around zero and as such lose
         # a single value
-        total_values = (2**total_bits if not signed else 2**total_bits-1)
+        total_values = 2**total_bits if not signed else 2**total_bits - 1
 
     values = torch.linspace(sign, 1.0, total_values)
     gap = 256 - values.numel()
     if gap == 0:
         return values
     else:
-        l = values.numel()//2
-        return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist())
+        l = values.numel() // 2
+        return torch.Tensor(values[:l].tolist() + [0] * gap + values[l:].tolist())
 
-def create_normal_map(offset=0.9677083, use_extra_value=True):
 
+def create_normal_map(offset=0.9677083, use_extra_value=True):
     if use_extra_value:
         # one more positive value, this is an asymmetric type
         v1 = norm.ppf(torch.linspace(offset, 0.5, 9)[:-1]).tolist()
-        v2 = [0]*(256-15) ## we have 15 non-zero values in this data type
+        v2 = [0] * (256 - 15)  ## we have 15 non-zero values in this data type
         v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist()
         v = v1 + v2 + v3
     else:
         v1 = norm.ppf(torch.linspace(offset, 0.5, 8)[:-1]).tolist()
-        v2 = [0]*(256-14) ## we have 14 non-zero values in this data type
+        v2 = [0] * (256 - 14)  ## we have 14 non-zero values in this data type
         v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist()
         v = v1 + v2 + v3
 
@@ -257,38 +269,37 @@ def create_normal_map(offset=0.9677083, use_extra_value=True):
     assert values.numel() == 256
     return values
 
+
 def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8):
     e = exponent_bits
     p = precision_bits
     has_sign = 1 if signed else 0
-    assert e+p == total_bits-has_sign
+    assert e + p == total_bits - has_sign
     # the exponent is biased to 2^(e-1) -1 == 0
     evalues = []
     pvalues = []
-    for i, val in enumerate(range(-((2**(exponent_bits-has_sign))), 2**(exponent_bits-has_sign), 1)):
+    for i, val in enumerate(range(-(2 ** (exponent_bits - has_sign)), 2 ** (exponent_bits - has_sign), 1)):
         evalues.append(2**val)
 
-
     values = []
     lst = list(itertools.product([0, 1], repeat=precision_bits))
-    #for ev in evalues:
-    bias = 2**(exponent_bits-1)
-    for evalue in range(2**(exponent_bits)):
+    # for ev in evalues:
+    bias = 2 ** (exponent_bits - 1)
+    for evalue in range(2 ** (exponent_bits)):
         for bit_pattern in lst:
-            value = (1 if evalue != 0 else 0)
+            value = 1 if evalue != 0 else 0
             for i, pval in enumerate(list(bit_pattern)):
-                value += pval*(2**-(i+1))
+                value += pval * (2 ** -(i + 1))
             if evalue == 0:
                 # subnormals
-                value = value*2**-(bias)
+                value = value * 2**-(bias)
             else:
                 # normals
-                value = value*2**-(evalue-bias-1)
+                value = value * 2 ** -(evalue - bias - 1)
             values.append(value)
             if signed:
                 values.append(-value)
 
-
     assert len(values) == 2**total_bits
     values.sort()
     if total_bits < 8:
@@ -302,7 +313,6 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8)
     return code
 
 
-
 def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
     """
     Creates the dynamic quantiztion map.
@@ -329,7 +339,11 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
     if not signed:
         additional_items = 2 * additional_items
     for i in range(max_exponent_bits):
-        fraction_items = int((2 ** (i + non_sign_bits - max_exponent_bits) + 1 if signed else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1))
+        fraction_items = int(
+            2 ** (i + non_sign_bits - max_exponent_bits) + 1
+            if signed
+            else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1
+        )
         boundaries = torch.linspace(0.1, 1, fraction_items)
         means = (boundaries[:-1] + boundaries[1:]) / 2.0
         data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
@@ -353,8 +367,9 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
     data.sort()
     return Tensor(data)
 
+
 def create_quantile_map(A, total_bits=8):
-    q = estimate_quantiles(A, num_quantiles=2**total_bits-1)
+    q = estimate_quantiles(A, num_quantiles=2**total_bits - 1)
     q = q.tolist()
     q.append(0)
 
@@ -365,11 +380,13 @@ def create_quantile_map(A, total_bits=8):
     q.sort()
 
     q = Tensor(q)
-    q = q/q.abs().max()
+    q = q / q.abs().max()
     return q
 
+
 def get_special_format_str():
-    if not torch.cuda.is_available(): return 'col_turing'
+    if not torch.cuda.is_available():
+        return "col_turing"
     major, _minor = torch.cuda.get_device_capability()
     if major <= 7:
         return "col_turing"
@@ -378,22 +395,27 @@ def get_special_format_str():
     return "col_turing"
 
 
-
 def is_on_gpu(tensors):
     on_gpu = True
     gpu_ids = set()
     for t in tensors:
-        if t is None: continue # NULL pointers are fine
-        is_paged = getattr(t, 'is_paged', False)
-        on_gpu &= (t.device.type == 'cuda' or is_paged)
+        if t is None:
+            continue  # NULL pointers are fine
+        is_paged = getattr(t, "is_paged", False)
+        on_gpu &= t.device.type == "cuda" or is_paged
         if not is_paged:
             gpu_ids.add(t.device.index)
     if not on_gpu:
-        raise TypeError(f'All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}')
+        raise TypeError(
+            f"All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}"
+        )
     if len(gpu_ids) > 1:
-        raise TypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}')
+        raise TypeError(
+            f"Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}"
+        )
     return on_gpu
 
+
 def get_ptr(A: Tensor) -> ct.c_void_p:
     """
     Get the ctypes pointer from a PyTorch Tensor.
@@ -434,9 +456,7 @@ def get_transform_func(dtype, orderA, orderOut, transpose=False):
         return getattr(lib, name)
 
 
-def get_transform_buffer(
-    shape, dtype, device, to_order, from_order="row", transpose=False
-):
+def get_transform_buffer(shape, dtype, device, to_order, from_order="row", transpose=False):
     # init_func = torch.empty
     init_func = torch.zeros
     dims = len(shape)
@@ -489,9 +509,7 @@ def nvidia_transform(
     else:
         from_order = state[1]
     if out is None:
-        out, new_state = get_transform_buffer(
-            state[0], A.dtype, A.device, to_order, state[1]
-        )
+        out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1])
     else:
         new_state = (state[1], to_order)
     func = get_transform_func(A.dtype, from_order, to_order, transpose)
@@ -516,7 +534,7 @@ def nvidia_transform(
 
 
 def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, num_quantiles=256) -> Tensor:
-    '''
+    """
     Estimates 256 equidistant quantiles on the input tensor eCDF.
 
     Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles
@@ -543,14 +561,21 @@ def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, n
     -------
     torch.Tensor:
         The 256 quantiles in float32 datatype.
-    '''
-    if A.numel() < 256: raise NotImplementedError(f'Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values.')
-    if num_quantiles > 256: raise NotImplementedError(f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}")
-    if num_quantiles < 256 and offset == 1/(512):
+    """
+    if A.numel() < 256:
+        raise NotImplementedError(
+            f"Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values."
+        )
+    if num_quantiles > 256:
+        raise NotImplementedError(
+            f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}"
+        )
+    if num_quantiles < 256 and offset == 1 / (512):
         # override default arguments
-        offset = 1/(2*num_quantiles)
+        offset = 1 / (2 * num_quantiles)
 
-    if out is None: out = torch.zeros((256,), dtype=torch.float32, device=A.device)
+    if out is None:
+        out = torch.zeros((256,), dtype=torch.float32, device=A.device)
     is_on_gpu([A, out])
     device = pre_call(A.device)
     if A.dtype == torch.float32:
@@ -562,14 +587,16 @@ def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, n
     post_call(device)
 
     if num_quantiles < 256:
-        step = round(256/num_quantiles)
+        step = round(256 / num_quantiles)
         idx = torch.linspace(0, 255, num_quantiles).long().to(A.device)
         out = out[idx]
 
     return out
 
 
-def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False) -> Tensor:
+def quantize_blockwise(
+    A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False
+) -> Tensor:
     """
     Quantize tensor A in blocks of size 4096 values.
 
@@ -596,7 +623,6 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou
         The quantization state to undo the quantization.
     """
 
-
     if code is None:
         if "dynamic" not in name2qmap:
             name2qmap["dynamic"] = create_dynamic_map().to(A.device)
@@ -611,23 +637,34 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou
     if out is None:
         out = torch.zeros_like(A, dtype=torch.uint8)
 
-    if A.device.type != 'cpu':
+    if A.device.type != "cpu":
         assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
         cblocksize = ct.c_int32(blocksize)
         prev_device = pre_call(A.device)
         code = code.to(A.device)
         is_on_gpu([code, A, out, absmax])
         if A.dtype == torch.float32:
-            lib.cquantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()))
+            lib.cquantize_blockwise_fp32(
+                get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel())
+            )
         elif A.dtype == torch.float16:
-            lib.cquantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()))
+            lib.cquantize_blockwise_fp16(
+                get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel())
+            )
         else:
             raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
         post_call(A.device)
     else:
         # cpu
         code = code.cpu()
-        lib.cquantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel()))
+        lib.cquantize_blockwise_cpu_fp32(
+            get_ptr(code),
+            get_ptr(A),
+            get_ptr(absmax),
+            get_ptr(out),
+            ct.c_longlong(blocksize),
+            ct.c_longlong(A.numel()),
+        )
 
     if nested:
         offset = absmax.mean()
@@ -637,8 +674,6 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou
     else:
         state = [absmax, code, blocksize, nested, None, None]
 
-
-
     return out, state
 
 
@@ -649,7 +684,7 @@ def dequantize_blockwise(
     code: Tensor = None,
     out: Tensor = None,
     blocksize: int = 4096,
-    nested=False
+    nested=False,
 ) -> Tensor:
     """
     Dequantizes blockwise quantized values.
@@ -686,41 +721,58 @@ def dequantize_blockwise(
         out = torch.zeros_like(A, dtype=torch.float32)
 
     if quant_state is None:
-       quant_state = (absmax, code, blocksize)
-       assert absmax is not None and out is not None
+        quant_state = (absmax, code, blocksize)
+        assert absmax is not None and out is not None
     else:
-       absmax, code, blocksize, nested, offset, state2 = quant_state
-       if nested:
-           absmax = dequantize_blockwise(absmax, state2)
-           absmax += offset
-
+        absmax, code, blocksize, nested, offset, state2 = quant_state
+        if nested:
+            absmax = dequantize_blockwise(absmax, state2)
+            absmax += offset
 
-    if A.device.type != 'cpu':
+    if A.device.type != "cpu":
         device = pre_call(A.device)
         code = code.to(A.device)
         if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
-            raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
+            raise ValueError(
+                f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]"
+            )
         is_on_gpu([A, absmax, out])
         if out.dtype == torch.float32:
-            lib.cdequantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()))
+            lib.cdequantize_blockwise_fp32(
+                get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())
+            )
         elif out.dtype == torch.float16:
-            lib.cdequantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()))
+            lib.cdequantize_blockwise_fp16(
+                get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())
+            )
         else:
             raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
         post_call(A.device)
     else:
         code = code.cpu()
-        lib.cdequantize_blockwise_cpu_fp32(get_ptr(quant_state[1]), get_ptr(A), get_ptr(quant_state[0]), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel()))
+        lib.cdequantize_blockwise_cpu_fp32(
+            get_ptr(quant_state[1]),
+            get_ptr(A),
+            get_ptr(quant_state[0]),
+            get_ptr(out),
+            ct.c_longlong(blocksize),
+            ct.c_longlong(A.numel()),
+        )
 
     return out
 
+
 def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
-    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4')
+    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4")
+
 
 def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
-    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4')
+    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4")
+
 
-def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor:
+def quantize_4bit(
+    A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type="fp4"
+) -> Tensor:
     """
     Quantize tensor A in blocks of 4-bit values.
 
@@ -746,10 +798,10 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
     tuple(torch.Tensor, torch.Size, torch.dtype, int):
         The quantization state to undo the quantization.
     """
-    if A.device.type != 'cuda':
-        raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
-    if quant_type not in ['fp4', 'nf4']:
-        raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
+    if A.device.type != "cuda":
+        raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}")
+    if quant_type not in ["fp4", "nf4"]:
+        raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
 
     n = A.numel()
     input_shape = A.shape
@@ -759,9 +811,8 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
         blocks += 1 if n % blocksize > 0 else 0
         absmax = torch.zeros((blocks,), device=A.device)
 
-
     if out is None:
-        out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device)
+        out = torch.zeros(((n + 1) // 2, 1), dtype=torch.uint8, device=A.device)
 
     assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
 
@@ -769,15 +820,23 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
     is_on_gpu([A, out, absmax])
 
     if A.dtype == torch.float32:
-        if quant_type == 'fp4':
-            lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+        if quant_type == "fp4":
+            lib.cquantize_blockwise_fp32_fp4(
+                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+            )
         else:
-            lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+            lib.cquantize_blockwise_fp32_nf4(
+                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+            )
     elif A.dtype == torch.float16:
-        if quant_type == 'fp4':
-            lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+        if quant_type == "fp4":
+            lib.cquantize_blockwise_fp16_fp4(
+                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+            )
         else:
-            lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
+            lib.cquantize_blockwise_fp16_nf4(
+                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
+            )
     else:
         raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
     post_call(A.device)
@@ -785,8 +844,8 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
     if compress_statistics:
         offset = absmax.mean()
         absmax -= offset
-        #code = create_custom_map().to(absmax.device)
-        #qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256)
+        # code = create_custom_map().to(absmax.device)
+        # qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256)
         qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
         del absmax
         state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
@@ -795,13 +854,35 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
 
     return out, state
 
-def dequantize_fp4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
-    return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
 
-def dequantize_nf4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
-    return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')
+def dequantize_fp4(
+    A: Tensor,
+    quant_state: Tuple[Tensor, Tensor] = None,
+    absmax: Tensor = None,
+    out: Tensor = None,
+    blocksize: int = 64,
+) -> Tensor:
+    return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
 
-def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
+
+def dequantize_nf4(
+    A: Tensor,
+    quant_state: Tuple[Tensor, Tensor] = None,
+    absmax: Tensor = None,
+    out: Tensor = None,
+    blocksize: int = 64,
+) -> Tensor:
+    return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
+
+
+def dequantize_4bit(
+    A: Tensor,
+    quant_state: Tuple[Tensor, Tensor] = None,
+    absmax: Tensor = None,
+    out: Tensor = None,
+    blocksize: int = 64,
+    quant_type="fp4",
+) -> Tensor:
     """
     Dequantizes FP4 blockwise quantized values.
 
@@ -829,9 +910,11 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax:
         Dequantized tensor.
     """
     if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
-        raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
-    if quant_type not in ['fp4', 'nf4']:
-        raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
+        raise ValueError(
+            f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]"
+        )
+    if quant_type not in ["fp4", "nf4"]:
+        raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
 
     if quant_state is None:
         assert absmax is not None and out is not None
@@ -840,7 +923,6 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax:
     else:
         absmax, shape, dtype, blocksize, compressed_stats, quant_type = quant_state
 
-
     if compressed_stats is not None:
         offset, state2 = compressed_stats
         absmax = dequantize_blockwise(absmax, state2)
@@ -851,26 +933,35 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax:
 
     n = out.numel()
 
-
     device = pre_call(A.device)
     is_on_gpu([A, absmax, out])
     if out.dtype == torch.float32:
-        if quant_type == 'fp4':
-            lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
+        if quant_type == "fp4":
+            lib.cdequantize_blockwise_fp32_fp4(
+                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)
+            )
         else:
-            lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
+            lib.cdequantize_blockwise_fp32_nf4(
+                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)
+            )
     elif out.dtype == torch.float16:
-        if quant_type == 'fp4':
-            lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
+        if quant_type == "fp4":
+            lib.cdequantize_blockwise_fp16_fp4(
+                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)
+            )
         else:
-            lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))
+            lib.cdequantize_blockwise_fp16_nf4(
+                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)
+            )
     else:
         raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
     post_call(A.device)
 
-    is_transposed = (True if A.shape[0] == 1 else False)
-    if is_transposed: return out.t()
-    else: return out
+    is_transposed = True if A.shape[0] == 1 else False
+    if is_transposed:
+        return out.t()
+    else:
+        return out
 
 
 def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
@@ -907,7 +998,7 @@ def dequantize(
 
 
 def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
-    '''
+    """
     Quantizes input tensor to 8-bit.
 
     Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
@@ -926,9 +1017,10 @@ def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
     -------
     torch.Tensor:
         Quantized 8-bit tensor.
-    '''
+    """
     prev_device = pre_call(A.device)
-    if out is None: out = torch.zeros_like(A, dtype=torch.uint8)
+    if out is None:
+        out = torch.zeros_like(A, dtype=torch.uint8)
     is_on_gpu([A, out])
     lib.cquantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()))
     post_call(prev_device)
@@ -936,7 +1028,7 @@ def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
 
 
 def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
-    '''
+    """
     Dequantizes the 8-bit tensor to 32-bit.
 
     Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
@@ -955,9 +1047,10 @@ def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
     -------
     torch.Tensor:
         32-bit output tensor.
-    '''
+    """
     prev_device = pre_call(A.device)
-    if out is None: out = torch.zeros_like(A, dtype=torch.float32)
+    if out is None:
+        out = torch.zeros_like(A, dtype=torch.float32)
     is_on_gpu([code, A, out])
     lib.cdequantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()))
     post_call(prev_device)
@@ -1024,16 +1117,17 @@ def optimizer_update_32bit(
     if max_unorm > 0.0:
         param_norm = torch.norm(p.data.float())
 
-
     optim_func = None
     if g.dtype == torch.float32:
         optim_func = str2optimizer32bit[optimizer_name][0]
     elif g.dtype == torch.float16:
         optim_func = str2optimizer32bit[optimizer_name][1]
-    elif (g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name])==3):
+    elif g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name]) == 3:
         optim_func = str2optimizer32bit[optimizer_name][2]
     else:
-        raise ValueError(f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}")
+        raise ValueError(
+            f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}"
+        )
 
     is_on_gpu([g, p, state1, state2, unorm_vec])
     prev_device = pre_call(g.device)
@@ -1053,7 +1147,8 @@ def optimizer_update_32bit(
         ct.c_float(lr),
         ct.c_float(gnorm_scale),
         ct.c_bool(skip_zeros),
-        ct.c_int32(g.numel()))
+        ct.c_int32(g.numel()),
+    )
     post_call(prev_device)
 
 
@@ -1209,7 +1304,6 @@ def optimizer_update_8bit_blockwise(
     gnorm_scale: float = 1.0,
     skip_zeros=False,
 ) -> None:
-
     optim_func = None
     prev_device = pre_call(g.device)
     is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2])
@@ -1217,8 +1311,11 @@ def optimizer_update_8bit_blockwise(
         optim_func = str2optimizer8bit_blockwise[optimizer_name][0]
     elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
         optim_func = str2optimizer8bit_blockwise[optimizer_name][1]
-    elif (g.dtype == torch.bfloat16 and state1.dtype == torch.uint8 and
-          len(str2optimizer8bit_blockwise[optimizer_name])==3):
+    elif (
+        g.dtype == torch.bfloat16
+        and state1.dtype == torch.uint8
+        and len(str2optimizer8bit_blockwise[optimizer_name]) == 3
+    ):
         optim_func = str2optimizer8bit_blockwise[optimizer_name][2]
     else:
         raise ValueError(
@@ -1250,9 +1347,8 @@ def optimizer_update_8bit_blockwise(
     )
     post_call(prev_device)
 
-def percentile_clipping(
-    grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5
-):
+
+def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5):
     """Applies percentile clipping
 
     grad: torch.Tensor
@@ -1294,9 +1390,7 @@ def percentile_clipping(
     return current_gnorm, clip_value, gnorm_scale
 
 
-def histogram_scatter_add_2d(
-    histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor
-):
+def histogram_scatter_add_2d(histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor):
     assert len(histogram.shape) == 2
     assert histogram.dtype == torch.float32
     assert source.dtype == torch.float32
@@ -1313,12 +1407,12 @@ def histogram_scatter_add_2d(
     is_on_gpu([histogram, index1, index2, source])
     lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n)
 
+
 def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8):
-    if not torch.cuda.is_initialized(): torch.cuda.init()
+    if not torch.cuda.is_initialized():
+        torch.cuda.init()
     if A.dtype != expected_type or B.dtype != expected_type:
-        raise TypeError(
-            f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}"
-        )
+        raise TypeError(f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}")
 
     sA = A.shape
     sB = B.shape
@@ -1359,12 +1453,7 @@ def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8
         sout = out.shape
         # special case common in backprop
         if not correct and len(sA) == 3 and len(sB) == 3:
-            if (
-                sout[0] == sA[2]
-                and sout[1] == sB[2]
-                and sA[0] == sB[0]
-                and sA[1] == sB[1]
-            ):
+            if sout[0] == sA[2] and sout[1] == sB[2] and sA[0] == sB[0] and sA[1] == sB[1]:
                 correct = True
     else:
         if len(sA) == 2 and len(sB) == 2:
@@ -1402,15 +1491,9 @@ def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8
 
     return sout
 
-def cutlass3_gemm(
-    A: Tensor,
-    B: Tensor,
-    out: Tensor = None,
-    transposed_A=False,
-    transposed_B=False,
-    state=None
-):
-    #sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype)
+
+def cutlass3_gemm(A: Tensor, B: Tensor, out: Tensor = None, transposed_A=False, transposed_B=False, state=None):
+    # sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype)
     if state is None:
         Bshape = B.shape
         bout = Bshape[1]
@@ -1489,15 +1572,15 @@ def cutlass3_gemm(
 
     # B^T @ A^T = C^T
     # [km, nk -> mn]
-    #lda = ldb = ldc = 1
-    #lda = 1
+    # lda = ldb = ldc = 1
+    # lda = 1
     if state is not None:
         m = Bshape[0]
         k = Bshape[1]
         lda = Bshape[0]
         ldc = Bshape[0]
-        ldb = (ldb+1)//2
-    #print(m, n, k, lda, ldb, ldc)
+        ldb = (ldb + 1) // 2
+    # print(m, n, k, lda, ldb, ldc)
     is_on_gpu([B, A, out])
     m = ct.c_int32(m)
     n = ct.c_int32(n)
@@ -1507,19 +1590,19 @@ def cutlass3_gemm(
     ldc = ct.c_int32(ldc)
 
     if B.dtype == torch.uint8:
-        lib.cgemm_4bit_inference(m, n, k, get_ptr(A), get_ptr(B), get_ptr(state[0]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3]))
+        lib.cgemm_4bit_inference(
+            m, n, k, get_ptr(A), get_ptr(B), get_ptr(state[0]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3])
+        )
     elif A.dtype == torch.float32:
         lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc)
     elif A.dtype == torch.float16:
         lib.cgemm_host_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc)
     else:
-        raise NotImplementedError(f'Matmul not implemented for data type {A.dtype}')
+        raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}")
 
     return out
 
 
-
-
 def igemm(
     A: Tensor,
     B: Tensor,
@@ -1604,8 +1687,20 @@ def igemm(
     # B^T @ A^T = C^T
     # [km, nk -> mn]
     is_on_gpu([B, A, out])
-    lib.cigemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k),
-               get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc))
+    lib.cigemm(
+        ptr,
+        ct.c_bool(transposed_B),
+        ct.c_bool(transposed_A),
+        ct.c_int32(m),
+        ct.c_int32(n),
+        ct.c_int32(k),
+        get_ptr(B),
+        get_ptr(A),
+        get_ptr(out),
+        ct.c_int32(lda),
+        ct.c_int32(ldb),
+        ct.c_int32(ldc),
+    )
     return out
 
 
@@ -1617,9 +1712,7 @@ def batched_igemm(
     transposed_B=False,
 ):
     if not len(A.shape) == 3 or not len(B.shape) == 3:
-        raise ValueError(
-            f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}"
-        )
+        raise ValueError(f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}")
     sout = check_matmul(A, B, out, transposed_A, transposed_B)
     if out is None:
         out = torch.zeros(size=sout, dtype=torch.int32, device=A.device)
@@ -1686,9 +1779,24 @@ def batched_igemm(
     ptr = CUBLAS_Context.get_instance().get_context(A.device)
 
     is_on_gpu([B, A, out])
-    lib.cbatched_igemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k),
-               get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc),
-               ct.c_long(strideA), ct.c_long(strideB), ct.c_long(strideC), ct.c_uint32(num_batch))
+    lib.cbatched_igemm(
+        ptr,
+        ct.c_bool(transposed_B),
+        ct.c_bool(transposed_A),
+        ct.c_int32(m),
+        ct.c_int32(n),
+        ct.c_int32(k),
+        get_ptr(B),
+        get_ptr(A),
+        get_ptr(out),
+        ct.c_int32(lda),
+        ct.c_int32(ldb),
+        ct.c_int32(ldc),
+        ct.c_long(strideA),
+        ct.c_long(strideB),
+        ct.c_long(strideC),
+        ct.c_uint32(num_batch),
+    )
     return out
 
 
@@ -1697,14 +1805,14 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     shapeB = SB[0]
     dimsA = len(shapeA)
     dimsB = len(shapeB)
-    assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
+    assert dimsB == 2, "Only two dimensional matrices are supported for argument B"
     if dimsA == 2:
         m = shapeA[0]
     elif dimsA == 3:
         m = shapeA[0] * shapeA[1]
 
     rows = n = shapeB[0]
-    assert prod(list(shapeA)) > 0, f'Input tensor dimensions need to be > 0: {shapeA}'
+    assert prod(list(shapeA)) > 0, f"Input tensor dimensions need to be > 0: {shapeA}"
 
     # if the tensor is empty, return a transformed empty tensor with the right dimensions
     if shapeA[0] == 0 and dimsA == 2:
@@ -1713,13 +1821,9 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
 
     if dimsA == 2 and out is None:
-        out, Sout = get_transform_buffer(
-            (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
-        )
+        out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row")
     elif dimsA == 3 and out is None:
-        out, Sout = get_transform_buffer(
-            (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
-        )
+        out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row")
 
     assert dimsB != 3, "len(B.shape)==3 not supported"
     assert A.device.type == "cuda"
@@ -1761,46 +1865,30 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     has_error = 0
     ptrRowScale = get_ptr(None)
     is_on_gpu([A, B, out])
-    if formatB == 'col_turing':
+    if formatB == "col_turing":
         if dtype == torch.int32:
-            has_error = lib.cigemmlt_turing_32(
-                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-            )
+            has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
         else:
-            has_error = lib.cigemmlt_turing_8(
-                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-            )
+            has_error = lib.cigemmlt_turing_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
     elif formatB == "col_ampere":
         if dtype == torch.int32:
-            has_error = lib.cigemmlt_ampere_32(
-                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-            )
+            has_error = lib.cigemmlt_ampere_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
         else:
-            has_error = lib.cigemmlt_ampere_8(
-                ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
-            )
+            has_error = lib.cigemmlt_ampere_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
 
     if has_error == 1:
-        print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}')
-        raise Exception('cublasLt ran into an error!')
+        print(f"A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}")
+        raise Exception("cublasLt ran into an error!")
 
     torch.cuda.set_device(prev_device)
 
     return out, Sout
 
 
-def mm_dequant(
-    A,
-    quant_state,
-    row_stats,
-    col_stats,
-    out=None,
-    new_row_stats=None,
-    new_col_stats=None,
-    bias=None
-):
+def mm_dequant(A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None):
     assert A.dtype == torch.int32
-    if bias is not None: assert bias.dtype == torch.float16
+    if bias is not None:
+        assert bias.dtype == torch.float16
     out_shape = quant_state[0]
     if len(out_shape) == 3:
         out_shape = (out_shape[0] * out_shape[1], out_shape[2])
@@ -1808,19 +1896,11 @@ def mm_dequant(
     if out is None:
         out = torch.empty(out_shape, dtype=torch.float16, device=A.device)
     if new_row_stats is None:
-        new_row_stats = torch.empty(
-            out_shape[0], dtype=torch.float32, device=A.device
-        )
+        new_row_stats = torch.empty(out_shape[0], dtype=torch.float32, device=A.device)
     if new_col_stats is None:
-        new_col_stats = torch.empty(
-            out_shape[1], dtype=torch.float32, device=A.device
-        )
-    assert (
-        new_row_stats.shape[0] == row_stats.shape[0]
-    ), f"{new_row_stats.shape} vs {row_stats.shape}"
-    assert (
-        new_col_stats.shape[0] == col_stats.shape[0]
-    ), f"{new_col_stats.shape} vs {col_stats.shape}"
+        new_col_stats = torch.empty(out_shape[1], dtype=torch.float32, device=A.device)
+    assert new_row_stats.shape[0] == row_stats.shape[0], f"{new_row_stats.shape} vs {row_stats.shape}"
+    assert new_col_stats.shape[0] == col_stats.shape[0], f"{new_col_stats.shape} vs {col_stats.shape}"
 
     prev_device = pre_call(A.device)
     ptrA = get_ptr(A)
@@ -1834,15 +1914,15 @@ def mm_dequant(
     numCols = ct.c_int32(out_shape[1])
 
     is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias])
-    lib.cdequant_mm_int32_fp16(ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols)
+    lib.cdequant_mm_int32_fp16(
+        ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols
+    )
     post_call(prev_device)
 
     return out
 
 
-def get_colrow_absmax(
-    A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0
-):
+def get_colrow_absmax(A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0):
     assert A.dtype == torch.float16
     device = A.device
 
@@ -1855,18 +1935,12 @@ def get_colrow_absmax(
     col_tiles = (cols + 255) // 256
     tiled_rows = ((rows + 15) // 16) * 16
     if row_stats is None:
-        row_stats = torch.empty(
-            (rows,), dtype=torch.float32, device=device
-        ).fill_(-50000.0)
+        row_stats = torch.empty((rows,), dtype=torch.float32, device=device).fill_(-50000.0)
     if col_stats is None:
-        col_stats = torch.empty(
-            (cols,), dtype=torch.float32, device=device
-        ).fill_(-50000.0)
+        col_stats = torch.empty((cols,), dtype=torch.float32, device=device).fill_(-50000.0)
 
     if nnz_block_ptr is None and threshold > 0.0:
-        nnz_block_ptr = torch.zeros(
-            ((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device
-        )
+        nnz_block_ptr = torch.zeros(((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device)
 
     ptrA = get_ptr(A)
     ptrRowStats = get_ptr(row_stats)
@@ -1940,14 +2014,10 @@ def __init__(self, rows, cols, nnz, colptr, rowidx, values):
 def coo2csr(cooA):
     values, counts = torch.unique(cooA.rowidx, return_counts=True)
     values.add_(1)
-    rowptr = torch.zeros(
-        (cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device
-    )
+    rowptr = torch.zeros((cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device)
     rowptr.scatter_(index=values.long(), src=counts.int(), dim=0)
     rowptr.cumsum_(0)
-    return CSRSparseTensor(
-        cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values
-    )
+    return CSRSparseTensor(cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values)
 
 
 def coo2csc(cooA):
@@ -1956,14 +2026,10 @@ def coo2csc(cooA):
     values = cooA.values[col2rowidx]
     colvalues, counts = torch.unique(val, return_counts=True)
     colvalues.add_(1)
-    colptr = torch.zeros(
-        (cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device
-    )
+    colptr = torch.zeros((cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device)
     colptr.scatter_(index=colvalues.long(), src=counts.int(), dim=0)
     colptr.cumsum_(0)
-    return CSCSparseTensor(
-        cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values
-    )
+    return CSCSparseTensor(cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values)
 
 
 def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
@@ -1973,9 +2039,7 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
     return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values)
 
 
-def double_quant(
-    A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
-):
+def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
     device = A.device
     assert A.dtype == torch.half
     assert device.type == "cuda"
@@ -1988,9 +2052,7 @@ def double_quant(
         rows = A.shape[0]
 
     if row_stats is None or col_stats is None:
-        row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(
-            A, threshold=threshold
-        )
+        row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(A, threshold=threshold)
 
     if out_col is None:
         out_col = torch.zeros(A.shape, device=device, dtype=torch.int8)
@@ -2008,9 +2070,7 @@ def double_quant(
     if threshold > 0.0:
         nnz = nnz_row_ptr[-1].item()
         if nnz > 0:
-            coo_tensor = coo_zeros(
-                A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device
-            )
+            coo_tensor = coo_zeros(A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device)
             ptrRowIdx = get_ptr(coo_tensor.rowidx)
             ptrColIdx = get_ptr(coo_tensor.colidx)
             ptrVal = get_ptr(coo_tensor.values)
@@ -2069,12 +2129,16 @@ def double_quant(
     return out_row, out_col, row_stats, col_stats, coo_tensor
 
 
-def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+def transform(A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None):
     prev_device = pre_call(A.device)
-    if state is None: state = (A.shape, from_order)
-    else: from_order = state[1]
-    if out is None: out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
-    else: new_state = (state[0], to_order) # (shape, order)
+    if state is None:
+        state = (A.shape, from_order)
+    else:
+        from_order = state[1]
+    if out is None:
+        out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
+    else:
+        new_state = (state[0], to_order)  # (shape, order)
 
     shape = state[0]
     if len(shape) == 2:
@@ -2085,7 +2149,7 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No
         dim2 = ct.c_int32(shape[2])
 
     is_on_gpu([A, out])
-    if to_order == 'col32':
+    if to_order == "col32":
         if transpose:
             lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
         else:
@@ -2106,7 +2170,7 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No
         elif from_order == "col_ampere":
             lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
     else:
-        raise NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}')
+        raise NotImplementedError(f"Transform function not implemented: From {from_order} to {to_order}")
 
     post_call(prev_device)
 
@@ -2115,9 +2179,7 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No
 
 def spmm_coo(cooA, B, out=None):
     if out is None:
-        out = torch.empty(
-            (cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype
-        )
+        out = torch.empty((cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype)
     nnz = cooA.nnz
     assert cooA.rowidx.numel() == nnz
     assert cooA.colidx.numel() == nnz
@@ -2144,16 +2206,28 @@ def spmm_coo(cooA, B, out=None):
     cldc = ct.c_int32(ldc)
 
     is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out])
-    lib.cspmm_coo(ptr, ptrRowidx, ptrColidx, ptrValues, cnnz, crowsA, ccolsA, ccolsB, cldb, ptrB, cldc, ptrC, ct.c_bool(transposed_B))
+    lib.cspmm_coo(
+        ptr,
+        ptrRowidx,
+        ptrColidx,
+        ptrValues,
+        cnnz,
+        crowsA,
+        ccolsA,
+        ccolsB,
+        cldb,
+        ptrB,
+        cldc,
+        ptrC,
+        ct.c_bool(transposed_B),
+    )
 
     return out
 
 
 def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None):
     if out is None:
-        out = torch.zeros(
-            (cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype
-        )
+        out = torch.zeros((cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype)
     nnz = cooA.nnz
     prev_device = pre_call(B.device)
     assert cooA.rowidx.numel() == nnz
@@ -2171,9 +2245,7 @@ def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None):
     max_count, max_idx = torch.sort(counts, descending=True)
     max_idx = max_idx.int()
     max_count = max_count.int()
-    assert (
-        max_count[0] <= 32
-    ), f"Current max count per row is 8 but found {max_count[0]}."
+    assert max_count[0] <= 32, f"Current max count per row is 8 but found {max_count[0]}."
     assert B.dtype in [torch.float16, torch.int8]
     ptrOffset = get_ptr(offset)
     ptrMaxCount = get_ptr(max_count)
@@ -2261,9 +2333,7 @@ def vectorwise_quant(x, dim=1, quant_type="vector"):
     elif quant_type in ["vector-zeropoint", "row-zeropoint"]:
         dtype = x.dtype
         x = x.float()
-        dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin(
-            x, dim=dim, keepdim=True
-        )
+        dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin(x, dim=dim, keepdim=True)
         dyna[dyna == 0] = 1
         qx = 255.0 / dyna
         minx = torch.amin(x, dim=dim, keepdim=True)
@@ -2371,9 +2441,7 @@ def extract_outliers(A, SA, idx):
     assert formatA in ["col_turing", "col_ampere"]
     assert A.device.type == "cuda"
 
-    out = torch.zeros(
-        (shapeA[0], idx.numel()), dtype=torch.int8, device=A.device
-    )
+    out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device)
 
     idx_size = ct.c_int32(idx.numel())
     rows = ct.c_int32(shapeA[0])
@@ -2383,7 +2451,7 @@ def extract_outliers(A, SA, idx):
     ptrOut = get_ptr(out)
 
     prev_device = pre_call(A.device)
-    if formatA == 'col_turing':
+    if formatA == "col_turing":
         lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
     elif formatA == "col_ampere":
         lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
@@ -2391,6 +2459,7 @@ def extract_outliers(A, SA, idx):
 
     return out
 
+
 def pipeline_test(A, batch_size):
     out = torch.zeros_like(A)
     lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index b4962c1a0..b0a790e70 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -17,25 +17,32 @@
 """
 
 import ctypes as ct
-import os
 import errno
-import torch
-from warnings import warn
-from itertools import product
-
+import os
 from pathlib import Path
 from typing import Set, Union
+from warnings import warn
+
+import torch
+
 from .env_vars import get_potentially_lib_path_containing_env_vars
 
 # these are the most common libs names
 # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
 # we have libcudart.so.11.0 which causes a lot of errors before
 # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt
-CUDA_RUNTIME_LIBS: list = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2']
+CUDA_RUNTIME_LIBS: list = [
+    "libcudart.so",
+    "libcudart.so.11.0",
+    "libcudart.so.12.0",
+    "libcudart.so.12.1",
+    "libcudart.so.12.2",
+]
 
 # this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths
 backup_paths = []
-backup_paths.append('$CONDA_PREFIX/lib/libcudart.so.11.0')
+backup_paths.append("$CONDA_PREFIX/lib/libcudart.so.11.0")
+
 
 class CUDASetup:
     _instance = None
@@ -44,59 +51,89 @@ def __init__(self):
         raise RuntimeError("Call get_instance() instead")
 
     def generate_instructions(self):
-        if getattr(self, 'error', False): return
+        if getattr(self, "error", False):
+            return
         print(self.error)
         self.error = True
         if not self.cuda_available:
-            self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected or CUDA not installed.')
-            self.add_log_entry('CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig.')
-            self.add_log_entry('CUDA SETUP: Solution 2): If you do not have sudo rights, you can do the following:')
-            self.add_log_entry('CUDA SETUP: Solution 2a): Find the cuda library via: find / -name libcuda.so 2>/dev/null')
-            self.add_log_entry('CUDA SETUP: Solution 2b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_2a')
-            self.add_log_entry('CUDA SETUP: Solution 2c): For a permanent solution add the export from 2b into your .bashrc file, located at ~/.bashrc')
-            self.add_log_entry('CUDA SETUP: Solution 3): For a missing CUDA runtime library (libcudart.so), use `find / -name libcudart.so* and follow with step (2b)')
+            self.add_log_entry(
+                "CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected or CUDA not installed."
+            )
+            self.add_log_entry(
+                "CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig."
+            )
+            self.add_log_entry("CUDA SETUP: Solution 2): If you do not have sudo rights, you can do the following:")
+            self.add_log_entry(
+                "CUDA SETUP: Solution 2a): Find the cuda library via: find / -name libcuda.so 2>/dev/null"
+            )
+            self.add_log_entry(
+                "CUDA SETUP: Solution 2b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_2a"
+            )
+            self.add_log_entry(
+                "CUDA SETUP: Solution 2c): For a permanent solution add the export from 2b into your .bashrc file, located at ~/.bashrc"
+            )
+            self.add_log_entry(
+                "CUDA SETUP: Solution 3): For a missing CUDA runtime library (libcudart.so), use `find / -name libcudart.so* and follow with step (2b)"
+            )
             return
 
         if self.cudart_path is None:
-            self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected.')
-            self.add_log_entry('CUDA SETUP: Solution 1: To solve the issue the libcudart.so location needs to be added to the LD_LIBRARY_PATH variable')
-            self.add_log_entry('CUDA SETUP: Solution 1a): Find the cuda runtime library via: find / -name libcudart.so 2>/dev/null')
-            self.add_log_entry('CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a')
-            self.add_log_entry('CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc')
-            self.add_log_entry('CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.')
-            self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh')
-            self.add_log_entry('CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO.')
-            self.add_log_entry('CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local')
+            self.add_log_entry(
+                "CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected."
+            )
+            self.add_log_entry(
+                "CUDA SETUP: Solution 1: To solve the issue the libcudart.so location needs to be added to the LD_LIBRARY_PATH variable"
+            )
+            self.add_log_entry(
+                "CUDA SETUP: Solution 1a): Find the cuda runtime library via: find / -name libcudart.so 2>/dev/null"
+            )
+            self.add_log_entry(
+                "CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a"
+            )
+            self.add_log_entry(
+                "CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc"
+            )
+            self.add_log_entry("CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.")
+            self.add_log_entry(
+                "CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh"
+            )
+            self.add_log_entry(
+                "CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO."
+            )
+            self.add_log_entry(
+                'CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local'
+            )
 
             return
 
-        make_cmd = f'CUDA_VERSION={self.cuda_version_string}'
+        make_cmd = f"CUDA_VERSION={self.cuda_version_string}"
         if len(self.cuda_version_string) < 3:
-            make_cmd += ' make cuda92'
-        elif self.cuda_version_string == '110':
-            make_cmd += ' make cuda110'
-        elif self.cuda_version_string[:2] == '11' and int(self.cuda_version_string[2]) > 0:
-            make_cmd += ' make cuda11x'
-        elif self.cuda_version_string[:2] == '12' and 1 >= int(self.cuda_version_string[2]) >= 0:
-            make_cmd += ' make cuda12x'
-        elif self.cuda_version_string == '100':
-            self.add_log_entry('CUDA SETUP: CUDA 10.0 not supported. Please use a different CUDA version.')
-            self.add_log_entry('CUDA SETUP: Before you try again running bitsandbytes, make sure old CUDA 10.0 versions are uninstalled and removed from $LD_LIBRARY_PATH variables.')
+            make_cmd += " make cuda92"
+        elif self.cuda_version_string == "110":
+            make_cmd += " make cuda110"
+        elif self.cuda_version_string[:2] == "11" and int(self.cuda_version_string[2]) > 0:
+            make_cmd += " make cuda11x"
+        elif self.cuda_version_string[:2] == "12" and 1 >= int(self.cuda_version_string[2]) >= 0:
+            make_cmd += " make cuda12x"
+        elif self.cuda_version_string == "100":
+            self.add_log_entry("CUDA SETUP: CUDA 10.0 not supported. Please use a different CUDA version.")
+            self.add_log_entry(
+                "CUDA SETUP: Before you try again running bitsandbytes, make sure old CUDA 10.0 versions are uninstalled and removed from $LD_LIBRARY_PATH variables."
+            )
             return
 
-
         has_cublaslt = is_cublasLt_compatible(self.cc)
         if not has_cublaslt:
-            make_cmd += '_nomatmul'
+            make_cmd += "_nomatmul"
 
-        self.add_log_entry('CUDA SETUP: Something unexpected happened. Please compile from source:')
-        self.add_log_entry('git clone https://github.com/TimDettmers/bitsandbytes.git')
-        self.add_log_entry('cd bitsandbytes')
+        self.add_log_entry("CUDA SETUP: Something unexpected happened. Please compile from source:")
+        self.add_log_entry("git clone https://github.com/TimDettmers/bitsandbytes.git")
+        self.add_log_entry("cd bitsandbytes")
         self.add_log_entry(make_cmd)
-        self.add_log_entry('python setup.py install')
+        self.add_log_entry("python setup.py install")
 
     def initialize(self):
-        if not getattr(self, 'initialized', False):
+        if not getattr(self, "initialized", False):
             self.has_printed = False
             self.lib = None
             self.initialized = False
@@ -104,16 +141,18 @@ def initialize(self):
 
     def manual_override(self):
         if torch.cuda.is_available():
-            if 'BNB_CUDA_VERSION' in os.environ:
-                if len(os.environ['BNB_CUDA_VERSION']) > 0:
-                    warn((f'\n\n{"="*80}\n'
-                          'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
-                          'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
-                          'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
-                          'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
-                          'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
-                          f'Loading CUDA version: BNB_CUDA_VERSION={os.environ["BNB_CUDA_VERSION"]}'
-                          f'\n{"="*80}\n\n'))
+            if "BNB_CUDA_VERSION" in os.environ:
+                if len(os.environ["BNB_CUDA_VERSION"]) > 0:
+                    warn(
+                        f'\n\n{"="*80}\n'
+                        'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
+                        'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
+                        'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
+                        'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
+                        'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
+                        f'Loading CUDA version: BNB_CUDA_VERSION={os.environ["BNB_CUDA_VERSION"]}'
+                        f'\n{"="*80}\n\n'
+                    )
                     self.binary_name = self.binary_name[:-6] + f'{os.environ["BNB_CUDA_VERSION"]}.so'
 
     def run_cuda_setup(self):
@@ -133,32 +172,40 @@ def run_cuda_setup(self):
 
         try:
             if not binary_path.exists():
-                self.add_log_entry(f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?")
+                self.add_log_entry(
+                    f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?"
+                )
                 legacy_binary_name = "libbitsandbytes_cpu.so"
                 self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
                 binary_path = package_dir / legacy_binary_name
                 if not binary_path.exists() or torch.cuda.is_available():
-                    self.add_log_entry('')
-                    self.add_log_entry('='*48 + 'ERROR' + '='*37)
-                    self.add_log_entry('CUDA SETUP: CUDA detection failed! Possible reasons:')
-                    self.add_log_entry('1. You need to manually override the PyTorch CUDA version. Please see: '
-                             '"https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md')
-                    self.add_log_entry('2. CUDA driver not installed')
-                    self.add_log_entry('3. CUDA not installed')
-                    self.add_log_entry('4. You have multiple conflicting CUDA libraries')
-                    self.add_log_entry('5. Required library not pre-compiled for this bitsandbytes release!')
-                    self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.')
-                    self.add_log_entry('CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via `conda list | grep cuda`.')
-                    self.add_log_entry('='*80)
-                    self.add_log_entry('')
+                    self.add_log_entry("")
+                    self.add_log_entry("=" * 48 + "ERROR" + "=" * 37)
+                    self.add_log_entry("CUDA SETUP: CUDA detection failed! Possible reasons:")
+                    self.add_log_entry(
+                        "1. You need to manually override the PyTorch CUDA version. Please see: "
+                        '"https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md'
+                    )
+                    self.add_log_entry("2. CUDA driver not installed")
+                    self.add_log_entry("3. CUDA not installed")
+                    self.add_log_entry("4. You have multiple conflicting CUDA libraries")
+                    self.add_log_entry("5. Required library not pre-compiled for this bitsandbytes release!")
+                    self.add_log_entry(
+                        "CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`."
+                    )
+                    self.add_log_entry(
+                        "CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via `conda list | grep cuda`."
+                    )
+                    self.add_log_entry("=" * 80)
+                    self.add_log_entry("")
                     self.generate_instructions()
-                    raise Exception('CUDA SETUP: Setup Failed!')
+                    raise Exception("CUDA SETUP: Setup Failed!")
                 self.lib = ct.cdll.LoadLibrary(binary_path)
             else:
                 self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path}...")
                 self.lib = ct.cdll.LoadLibrary(binary_path)
         except Exception as ex:
-            #debug
+            # debug
             self.add_log_entry("Exception in run_cuda_setup: \n")
             self.add_log_entry(str(ex))
 
@@ -183,14 +230,18 @@ def get_instance(cls):
 def is_cublasLt_compatible(cc):
     has_cublaslt = False
     if cc is not None:
-        cc_major, cc_minor = cc.split('.')
+        cc_major, cc_minor = cc.split(".")
         if int(cc_major) < 7 or (int(cc_major) == 7 and int(cc_minor) < 5):
-            CUDASetup.get_instance().add_log_entry("WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU! \
-                    If you run into issues with 8-bit matmul, you can try 4-bit quantization: https://huggingface.co/blog/4bit-transformers-bitsandbytes", is_warning=True)
+            CUDASetup.get_instance().add_log_entry(
+                "WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU! \
+                    If you run into issues with 8-bit matmul, you can try 4-bit quantization: https://huggingface.co/blog/4bit-transformers-bitsandbytes",
+                is_warning=True,
+            )
         else:
             has_cublaslt = True
     return has_cublaslt
 
+
 def extract_candidate_paths(paths_list_candidate: str) -> Set[Path]:
     return {Path(ld_path) for ld_path in paths_list_candidate.split(":") if ld_path}
 
@@ -202,7 +253,7 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
             if path.exists():
                 existent_directories.add(path)
         except PermissionError as pex:
-            # Handle the PermissionError first as it is a subtype of OSError 
+            # Handle the PermissionError first as it is a subtype of OSError
             # https://docs.python.org/3/library/exceptions.html#exception-hierarchy
             pass
         except OSError as exc:
@@ -211,8 +262,11 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
 
     non_existent_directories: Set[Path] = candidate_paths - existent_directories
     if non_existent_directories:
-        CUDASetup.get_instance().add_log_entry("The following directories listed in your path were found to "
-            f"be non-existent: {non_existent_directories}", is_warning=False)
+        CUDASetup.get_instance().add_log_entry(
+            "The following directories listed in your path were found to "
+            f"be non-existent: {non_existent_directories}",
+            is_warning=False,
+        )
 
     return existent_directories
 
@@ -238,9 +292,7 @@ def resolve_paths_list(paths_list_candidate: str) -> Set[Path]:
 
 
 def find_cuda_lib_in(paths_list_candidate: str) -> Set[Path]:
-    return get_cuda_runtime_lib_paths(
-        resolve_paths_list(paths_list_candidate)
-    )
+    return get_cuda_runtime_lib_paths(resolve_paths_list(paths_list_candidate))
 
 
 def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
@@ -248,27 +300,28 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
         warning_msg = (
             f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. "
             "We select the PyTorch default libcudart.so, which is {torch.version.cuda},"
-            "but this might missmatch with the CUDA version that is needed for bitsandbytes."
+            "but this might mismatch with the CUDA version that is needed for bitsandbytes."
             "To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variable"
             "For example, if you want to use the CUDA version 122"
             "BNB_CUDA_VERSION=122 python ..."
             "OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122"
             "In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g."
-            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2")
+            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2"
+        )
         CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True)
 
 
 def determine_cuda_runtime_lib_path() -> Union[Path, None]:
     """
-        Searches for a cuda installations, in the following order of priority:
-            1. active conda env
-            2. LD_LIBRARY_PATH
-            3. any other env vars, while ignoring those that
-                - are known to be unrelated (see `bnb.cuda_setup.env_vars.to_be_ignored`)
-                - don't contain the path separator `/`
-
-        If multiple libraries are found in part 3, we optimistically try one,
-        while giving a warning message.
+    Searches for a cuda installations, in the following order of priority:
+        1. active conda env
+        2. LD_LIBRARY_PATH
+        3. any other env vars, while ignoring those that
+            - are known to be unrelated (see `bnb.cuda_setup.env_vars.to_be_ignored`)
+            - don't contain the path separator `/`
+
+    If multiple libraries are found in part 3, we optimistically try one,
+    while giving a warning message.
     """
     candidate_env_vars = get_potentially_lib_path_containing_env_vars()
 
@@ -282,8 +335,11 @@ def determine_cuda_runtime_lib_path() -> Union[Path, None]:
         if conda_cuda_libs:
             cuda_runtime_libs.update(conda_cuda_libs)
 
-        CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["CONDA_PREFIX"]} did not contain '
-            f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', is_warning=True)
+        CUDASetup.get_instance().add_log_entry(
+            f'{candidate_env_vars["CONDA_PREFIX"]} did not contain '
+            f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...',
+            is_warning=True,
+        )
 
     if "LD_LIBRARY_PATH" in candidate_env_vars:
         lib_ld_cuda_libs = find_cuda_lib_in(candidate_env_vars["LD_LIBRARY_PATH"])
@@ -292,11 +348,15 @@ def determine_cuda_runtime_lib_path() -> Union[Path, None]:
             cuda_runtime_libs.update(lib_ld_cuda_libs)
         warn_in_case_of_duplicates(lib_ld_cuda_libs)
 
-        CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["LD_LIBRARY_PATH"]} did not contain '
-            f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', is_warning=True)
+        CUDASetup.get_instance().add_log_entry(
+            f'{candidate_env_vars["LD_LIBRARY_PATH"]} did not contain '
+            f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...',
+            is_warning=True,
+        )
 
     remaining_candidate_env_vars = {
-        env_var: value for env_var, value in candidate_env_vars.items()
+        env_var: value
+        for env_var, value in candidate_env_vars.items()
         if env_var not in {"CONDA_PREFIX", "LD_LIBRARY_PATH"}
     }
 
@@ -305,13 +365,15 @@ def determine_cuda_runtime_lib_path() -> Union[Path, None]:
         cuda_runtime_libs.update(find_cuda_lib_in(value))
 
     if len(cuda_runtime_libs) == 0:
-        CUDASetup.get_instance().add_log_entry('CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...')
-        cuda_runtime_libs.update(find_cuda_lib_in('/usr/local/cuda/lib64'))
+        CUDASetup.get_instance().add_log_entry(
+            "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths..."
+        )
+        cuda_runtime_libs.update(find_cuda_lib_in("/usr/local/cuda/lib64"))
 
     warn_in_case_of_duplicates(cuda_runtime_libs)
 
     cuda_setup = CUDASetup.get_instance()
-    cuda_setup.add_log_entry(f'DEBUG: Possible options found for libcudart.so: {cuda_runtime_libs}')
+    cuda_setup.add_log_entry(f"DEBUG: Possible options found for libcudart.so: {cuda_runtime_libs}")
 
     return next(iter(cuda_runtime_libs)) if cuda_runtime_libs else None
 
@@ -321,9 +383,12 @@ def get_cuda_version():
     major, minor = map(int, torch.version.cuda.split("."))
 
     if major < 11:
-        CUDASetup.get_instance().add_log_entry('CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!')
+        CUDASetup.get_instance().add_log_entry(
+            "CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!"
+        )
+
+    return f"{major}{minor}"
 
-    return f'{major}{minor}'
 
 def get_compute_capabilities():
     ccs = []
@@ -338,25 +403,34 @@ def get_compute_capabilities():
 
 def evaluate_cuda_setup():
     cuda_setup = CUDASetup.get_instance()
-    if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
-        cuda_setup.add_log_entry('')
-        cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35)
-        cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'),
-              ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
-        cuda_setup.add_log_entry('='*80)
-    if not torch.cuda.is_available(): return 'libbitsandbytes_cpu.so', None, None, None
-    if torch.version.hip: return 'libbitsandbytes_hip_nohipblaslt.so', None, None, None
+    if "BITSANDBYTES_NOWELCOME" not in os.environ or str(os.environ["BITSANDBYTES_NOWELCOME"]) == "0":
+        cuda_setup.add_log_entry("")
+        cuda_setup.add_log_entry("=" * 35 + "BUG REPORT" + "=" * 35)
+        cuda_setup.add_log_entry(
+            ("Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n"),
+            (
+                "and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues"
+            ),
+        )
+        cuda_setup.add_log_entry("=" * 80)
+    if not torch.cuda.is_available():
+        return "libbitsandbytes_cpu.so", None, None, None
+    if torch.version.hip:
+        return "libbitsandbytes_hip_nohipblaslt.so", None, None, None
 
     cudart_path = determine_cuda_runtime_lib_path()
     ccs = get_compute_capabilities()
     ccs.sort()
-    cc = ccs[-1] # we take the highest capability
+    cc = ccs[-1]  # we take the highest capability
     cuda_version_string = get_cuda_version()
 
-    cuda_setup.add_log_entry(f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}.")
-    cuda_setup.add_log_entry(f"CUDA SETUP: To manually override the PyTorch CUDA version please see:"
-                             "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md")
-
+    cuda_setup.add_log_entry(
+        f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}."
+    )
+    cuda_setup.add_log_entry(
+        "CUDA SETUP: To manually override the PyTorch CUDA version please see:"
+        "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md"
+    )
 
     # 7.5 is the minimum CC vor cublaslt
     has_cublaslt = is_cublasLt_compatible(cc)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 4f7bba4ee..37728bb4a 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -14,7 +14,7 @@
 
 from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
 
-from .cextension import lib, HIP_ENVIRONMENT
+from .cextension import HIP_ENVIRONMENT, lib
 
 
 # math.prod not compatible with python < 3.8
@@ -160,7 +160,7 @@ def __init__(self):
         raise RuntimeError("Call get_instance() instead")
 
     def initialize(self):
-        #self.context = ct.c_void_p(lib.get_cusparse())
+        # self.context = ct.c_void_p(lib.get_cusparse())
         if torch.version.cuda:
             self.context = ct.c_void_p(lib.get_cusparse())
         elif torch.version.hip:
@@ -528,8 +528,8 @@ def nvidia_transform(
     ld=None,
 ):
     if HIP_ENVIRONMENT:
-        to_order = "col" if to_order in ["col32","col_turing","col_ampere"] else to_order
-        from_order = "col" if from_order in ["col32","col_turing","col_ampere"] else from_order
+        to_order = "col" if to_order in ["col32", "col_turing", "col_ampere"] else to_order
+        from_order = "col" if from_order in ["col32", "col_turing", "col_ampere"] else from_order
 
     if state is None:
         state = (A.shape, from_order)
@@ -850,7 +850,7 @@ def quantize_blockwise(
     if out is None:
         out = torch.zeros_like(A, dtype=torch.uint8)
 
-    if A.device.type != 'cpu':
+    if A.device.type != "cpu":
         if not HIP_ENVIRONMENT:
             assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
         else:
@@ -1291,7 +1291,7 @@ def dequantize_fp4(
     quant_state: Optional[QuantState] = None,
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-    blocksize: int = None,
+    blocksize: Optional[int] = None,
 ) -> Tensor:
     if blocksize is None:
         blocksize = 64 if not HIP_ENVIRONMENT else 128
@@ -1304,7 +1304,7 @@ def dequantize_nf4(
     quant_state: Optional[QuantState] = None,
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-    blocksize: int = None,
+    blocksize: Optional[int] = None,
 ) -> Tensor:
     if blocksize is None:
         blocksize = 64 if not HIP_ENVIRONMENT else 128
@@ -1317,7 +1317,7 @@ def dequantize_4bit(
     quant_state: Optional[QuantState] = None,
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-    blocksize: int = None,
+    blocksize: Optional[int] = None,
     quant_type="fp4",
 ) -> Tensor:
     """
@@ -1348,7 +1348,7 @@ def dequantize_4bit(
     """
     if blocksize is None:
         blocksize = 64 if not HIP_ENVIRONMENT else 128
-    
+
     supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
     if HIP_ENVIRONMENT:
         supported_blocksizes = supported_blocksizes[:-1]
@@ -2368,7 +2368,7 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     has_error = 0
     ptrRowScale = get_ptr(None)
     is_on_gpu([A, B, out])
-    if formatB == 'col_turing' or HIP_ENVIRONMENT:
+    if formatB == "col_turing" or HIP_ENVIRONMENT:
         if dtype == torch.int32:
             has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
         else:
@@ -2393,7 +2393,7 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
 
 def mm_dequant(A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None):
     if HIP_ENVIRONMENT:
-        A, quant_state = nvidia_transform(A, "row", state = quant_state)
+        A, quant_state = nvidia_transform(A, "row", state=quant_state)
     assert A.dtype == torch.int32
     if bias is not None:
         assert bias.dtype == torch.float16
@@ -2645,9 +2645,9 @@ def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None,
     return out_row, out_col, row_stats, col_stats, coo_tensor
 
 
-def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+def transform(A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None):
     if HIP_ENVIRONMENT:
-        return nvidia_transform(A,to_order,from_order,out,transpose,state,ld)
+        return nvidia_transform(A, to_order, from_order, out, transpose, state, ld)
 
     prev_device = pre_call(A.device)
     if state is None:
@@ -2973,7 +2973,7 @@ def extract_outliers(A, SA, idx):
     ptrOut = get_ptr(out)
 
     prev_device = pre_call(A.device)
-    if formatA == 'col_turing' or HIP_ENVIRONMENT:
+    if formatA == "col_turing" or HIP_ENVIRONMENT:
         lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
     elif formatA == "col_ampere":
         lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index ad2579664..3684badf6 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -12,12 +12,11 @@
 
 import bitsandbytes as bnb
 from bitsandbytes.autograd._functions import get_tile_inds, undo_layout
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 from bitsandbytes.functional import QuantState
 from bitsandbytes.optim import GlobalOptimManager
 from bitsandbytes.utils import OutlierTracer
 
-from bitsandbytes.cextension import HIP_ENVIRONMENT
-
 T = TypeVar("T", bound="torch.nn.Module")
 
 
@@ -212,7 +211,7 @@ def __new__(
         data: Optional[torch.Tensor] = None,
         requires_grad=False,  # quantized weights should be frozen by default
         quant_state: Optional[QuantState] = None,
-        blocksize: int = None,
+        blocksize: Optional[int] = None,
         compress_statistics: bool = True,
         quant_type: str = "fp4",
         quant_storage: torch.dtype = torch.uint8,
@@ -221,7 +220,7 @@ def __new__(
     ) -> "Params4bit":
         if data is None:
             data = torch.empty(0)
-        
+
         if blocksize is None:
             blocksize = 64 if not HIP_ENVIRONMENT else 128
 
diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py
index 9598bb1e3..e5655b546 100644
--- a/bitsandbytes/research/autograd/_functions.py
+++ b/bitsandbytes/research/autograd/_functions.py
@@ -6,9 +6,9 @@
 import torch
 
 from bitsandbytes.autograd._functions import GlobalOutlierPooler, MatmulLtState
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 import bitsandbytes.functional as F
 
-from bitsandbytes.cextension import HIP_ENVIRONMENT
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index dd7011f6b..6ff643a07 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -22,7 +22,7 @@
 
 
 // source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
-// Luckily we have atomicmax and atomicmin in ROCm 
+// Luckily we have atomicmax and atomicmin in ROCm
 
 __device__ float dDequantizeFP4(unsigned char val, float absmax)
 {
@@ -86,7 +86,7 @@ __device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
         return 1.00000000f*absmax*sign; // 1011
       else
         return 0.66666667f*absmax*sign; // 1010
-    else 
+    else
       if((val & 0b0001) == 1) // 100
         return 5.208333333e-03f*absmax*sign; // 1001
       else
@@ -110,7 +110,7 @@ __device__ unsigned char dQuantizeFP4(float x)
 
   // we do a binary search
   // the pivots are divided by 12 (the FP4 absmax)
-  // since we assum input data is in [-1.0, 1.0]
+  // since we assume input data is in [-1.0, 1.0]
 
   // !be careful here, its easy to make a mistake
   // that is difficult to noice if you add an extra
@@ -150,36 +150,36 @@ __device__ half dhDequantizeNF4(unsigned char val)
     if((val & 0b0100) == 4) // 1
       if((val & 0b0010) == 2) // 11
         if((val & 0b0001) == 1) // 111
-          return 1.0f; 
+          return 1.0f;
         else
           return 0.7229568362236023f;
       else
         if((val & 0b0001) == 1) // 110
-          return 0.5626170039176941f; 
+          return 0.5626170039176941f;
         else
-          return 0.44070982933044434f; 
+          return 0.44070982933044434f;
     else
       if((val & 0b0010) == 2) //10
         if((val & 0b0001) == 1) // 101
-          return 0.33791524171829224f; 
+          return 0.33791524171829224f;
         else
-          return 0.24611230194568634f; 
-      else 
+          return 0.24611230194568634f;
+      else
         if((val & 0b0001) == 1) // 100
-          return 0.16093020141124725f; 
+          return 0.16093020141124725f;
         else
-          return 0.07958029955625534f; 
+          return 0.07958029955625534f;
 
   else
     if((val & 0b0100) == 4) // 0
       if((val & 0b0010) == 2) //01
         if((val & 0b0001) == 1) // 011
-          return 0.0f; 
+          return 0.0f;
         else
-          return -0.09105003625154495f; 
+          return -0.09105003625154495f;
       else
         if((val & 0b0001) == 1) // 010
-          return -0.18477343022823334f; 
+          return -0.18477343022823334f;
         else
           return -0.28444138169288635f;
     else
@@ -187,12 +187,12 @@ __device__ half dhDequantizeNF4(unsigned char val)
         if((val & 0b0001) == 1) // 001
           return -0.39491748809814453f;
         else
-          return -0.5250730514526367f; 
-      else 
+          return -0.5250730514526367f;
+      else
         if((val & 0b0001) == 1) // 000
-          return -0.6961928009986877f; 
+          return -0.6961928009986877f;
         else
-          return -1.0f; 
+          return -1.0f;
 
 }
 
@@ -205,36 +205,36 @@ __device__ float dDequantizeNF4(unsigned char val)
     if((val & 0b0100) == 4) // 1
       if((val & 0b0010) == 2) // 11
         if((val & 0b0001) == 1) // 111
-          return 1.0f; 
+          return 1.0f;
         else
           return 0.7229568362236023f;
       else
         if((val & 0b0001) == 1) // 110
-          return 0.5626170039176941f; 
+          return 0.5626170039176941f;
         else
-          return 0.44070982933044434f; 
+          return 0.44070982933044434f;
     else
       if((val & 0b0010) == 2) //10
         if((val & 0b0001) == 1) // 101
-          return 0.33791524171829224f; 
+          return 0.33791524171829224f;
         else
-          return 0.24611230194568634f; 
-      else 
+          return 0.24611230194568634f;
+      else
         if((val & 0b0001) == 1) // 100
-          return 0.16093020141124725f; 
+          return 0.16093020141124725f;
         else
-          return 0.07958029955625534f; 
+          return 0.07958029955625534f;
 
   else
     if((val & 0b0100) == 4) // 0
       if((val & 0b0010) == 2) //01
         if((val & 0b0001) == 1) // 011
-          return 0.0f; 
+          return 0.0f;
         else
-          return -0.09105003625154495f; 
+          return -0.09105003625154495f;
       else
         if((val & 0b0001) == 1) // 010
-          return -0.18477343022823334f; 
+          return -0.18477343022823334f;
         else
           return -0.28444138169288635f;
     else
@@ -242,12 +242,12 @@ __device__ float dDequantizeNF4(unsigned char val)
         if((val & 0b0001) == 1) // 001
           return -0.39491748809814453f;
         else
-          return -0.5250730514526367f; 
-      else 
+          return -0.5250730514526367f;
+      else
         if((val & 0b0001) == 1) // 000
-          return -0.6961928009986877f; 
+          return -0.6961928009986877f;
         else
-          return -1.0f; 
+          return -1.0f;
 
 }
 
@@ -1841,7 +1841,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
               //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps);
               //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val;
               g_val *= gnorm_scale;
-              
+
 							s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val));
 
 							s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
@@ -2237,8 +2237,8 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
   }
 
   // 4. store data via atomicMax
-  // to store col data efficienctly we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
-  // into a striped arangement: [0, 8, 16, 24, ..] for t0
+  // to store col data efficiently we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
+  // into a striped arrangement: [0, 8, 16, 24, ..] for t0
   __syncthreads();
   BlockExchange(temp_storage.exchange).BlockedToStriped(local_col_absmax_values);
 
@@ -2288,7 +2288,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
 
   // data is in 32 column-tile major with tile width 32 columns and numRows rows
   // L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
-  // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+  // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
   // C1. Compute val(row_stat*col_stat)/(127*127) (load 1/(127*127 into register))
   // C2. Compute normalization values and store col values in register
   // S1. Store C1 into 16-bit output
@@ -2367,7 +2367,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
   #pragma unroll ITEMS_PER_THREAD
   for(int j = 0; j < ITEMS_PER_THREAD; j++)
     local_output[j] = __float2half((local_values[j]*MM_DEQUANT_CONST*rowStat[j]*colStat[j]) + local_biasValue[j]);
- 
+
   // each block processes SUBTILE_ROWS*32 elements
   #pragma unroll ITEMS_PER_THREAD
   for(int j = 0; j < ITEMS_PER_THREAD; j++)
@@ -2390,14 +2390,14 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
     if(valid_items <= 0) // the sub-tile might have more elements than the tile itself
       break;
 
-    // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+    // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
     LoadInt32(loadint32).Load(&(A[subtile_idx]), local_values, valid_items, 0);
     ExchangeInt32(exchangeint32).BlockedToWarpStriped(local_values, local_values);
 
     #pragma unroll ITEMS_PER_THREAD
     for(int j = 0; j < ITEMS_PER_THREAD; j++)
       local_rowStats[j] = smem_rowStats[subtile_base_row+row_offset+j];
-	
+
     #pragma unroll ITEMS_PER_THREAD
     for(int j = 0; j < ITEMS_PER_THREAD; j++)
       local_output[j] = __float2half((local_values[j]*MM_DEQUANT_CONST*local_rowStats[j]*colStat) + local_biasValue);
@@ -2657,7 +2657,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                   // row1 [col0 col1 ... col31]
                   // ...
                   //
-                  // As such we read consequtive entries with 256 threads (8rows x 32 columns)
+                  // As such we read consecutive entries with 256 threads (8rows x 32 columns)
                   // as j increase, the row increase by a factor of 8
                   // We load 8 rows per subrow loop, and subrow increase by 8 per loop
                   // so we have an offset of 8 rows every loop or (subrow/warps)*8 = (subrow/8)*8
@@ -2754,7 +2754,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                     // each of these has 32 values in total for 32*4 = 128 as offset if odd
                     // every set of 4 columns increases the total offset by 16
                     // each even row increase the offset by 4, for example row 2 is offset by 4, 4 by 6 etc so: subrow/2*4 = subrow*2
-                    // this happends every 8 rows anew (subrow % 8)
+                    // this happens every 8 rows anew (subrow % 8)
                     // one writes 4 columns at once that is (col % 4) for the particular index in the subtile
                     int subcol = warp_lane;
 
@@ -3066,7 +3066,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 	{
 		//col-major offset
 		int offset = local_colidx * rowsA + row;
-	
+
 		char val = A[offset];
 		int out_idx = (row*idx_size) + blockIdx.x;
 		out[out_idx] = val;
@@ -3087,11 +3087,11 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 //// use k warps per thread block
 //// 1. threadblock use read-only cache to read in register tile for A into shared memory
 //// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments
-//// 3. each warp reads a segment of values 16x32 from B 
+//// 3. each warp reads a segment of values 16x32 from B
 //// 4. do dequantization from register of B into second pair of registers
 //// 5. store (4) into fragment
 //// 6. matmul aggregate into fragment C
-//// 7. aggreecate files of C into shared memroy block C
+//// 7. aggregate files of C into shared memory block C
 //// 8. sum (7)
 //// 9. write outputs to matmul output matrix
 //}
@@ -3549,7 +3549,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
 template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, const float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize)
 {
 
-  // per threadblock: 
+  // per threadblock:
   // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps]
   // 4 warps -> 4 loads per iter
   // 1x32 * 32x4 -> 1x4 outputs per thread block
@@ -3782,7 +3782,7 @@ template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long
   {
     switch(FUNC)
     {
-      case FILL: 
+      case FILL:
         A[i] = (T)value;
         break;
       case ARANGE:
diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
index f03636e47..be6abc070 100644
--- a/csrc/pythonInterface.cpp
+++ b/csrc/pythonInterface.cpp
@@ -429,7 +429,7 @@ extern "C"
 	{ \
 		transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((cublasLtHandle_t) context->m_handle, A, out, dim1, dim2); \
 	} \
-	
+
 #endif
 
 #if BUILD_HIP
@@ -572,7 +572,7 @@ extern "C"
 		int hasPrefetch = 0;
 		CUDA_CHECK_RETURN(hipDeviceGetAttribute(&hasPrefetch, hipDeviceAttributeConcurrentManagedAccess, device)); // 40ns overhead
 		if (hasPrefetch == 0) return;
- 
+
 		CUDA_CHECK_RETURN(hipMemPrefetchAsync(ptr, bytes, device, 0));
 		CUDA_CHECK_RETURN(hipPeekAtLastError());
 	}
diff --git a/install_cuda.py b/install_cuda.py
index a5d09356d..cf7c8ee71 100644
--- a/install_cuda.py
+++ b/install_cuda.py
@@ -77,9 +77,7 @@ def main():
     download_path = "/tmp"  # default download path
 
     if len(sys.argv) < 2:
-        print(
-            "Usage: python install_cuda.py <version/all> [user/system] [download_path]"
-        )
+        print("Usage: python install_cuda.py <version/all> [user/system] [download_path]")
         sys.exit(1)
 
     version = sys.argv[1]
@@ -100,9 +98,7 @@ def main():
     elif version in cuda_versions:
         install_cuda(version, base_path, download_path)
     else:
-        print(
-            f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}"
-        )
+        print(f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}")
         sys.exit(1)
 
 
diff --git a/tests/helpers.py b/tests/helpers.py
index fc7ce1acb..e93c11b70 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -63,9 +63,9 @@ def id_formatter(label: str):
 def describe_dtype(dtype: torch.dtype) -> str:
     return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2]
 
+
 def get_blocksizes(hip_env: bool) -> List[int]:
     if not hip_env:
         return [4096, 2048, 1024, 512, 256, 128, 64]
     else:
         return [4096, 2048, 1024, 512, 256, 128]
-
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 8c9acb31d..9da665a2d 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -4,8 +4,6 @@
 import torch
 
 import bitsandbytes as bnb
-from bitsandbytes.cextension import HIP_ENVIRONMENT
-
 from tests.helpers import (
     BOOLEAN_TRIPLES,
     BOOLEAN_TUPLES,
diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py
index e01a15b94..53dd25044 100644
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -1,6 +1,6 @@
 import pytest
 
-from bitsandbytes.cextension import get_cuda_bnb_library_path, HIP_ENVIRONMENT
+from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
 from bitsandbytes.cuda_specs import CUDASpecs
 
 
diff --git a/tests/test_functional.py b/tests/test_functional.py
index a729ecebe..0f817d1dc 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -12,14 +12,7 @@
 import bitsandbytes as bnb
 from bitsandbytes import functional as F
 from bitsandbytes.cextension import HIP_ENVIRONMENT
-from tests.helpers import (
-    BOOLEAN_TUPLES,
-    TRUE_FALSE,
-    describe_dtype,
-    get_test_dims,
-    id_formatter,
-    get_blocksizes
-)
+from tests.helpers import BOOLEAN_TUPLES, TRUE_FALSE, describe_dtype, get_blocksizes, get_test_dims, id_formatter
 
 torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
 k = 20
@@ -115,6 +108,7 @@ def test_estimate_quantiles(dtype):
     diff = torch.abs(code - quantiles)
     assert (diff > 5e-02).sum().item() == 0
 
+
 def test_quantile_quantization():
     for i in range(100):
         A1 = torch.randn(1024, 1024, device="cuda")
@@ -516,7 +510,9 @@ def test_vector_quant(dim1, dim2, dim3):
 @pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3"))
 @pytest.mark.parametrize("dtype", [torch.int8, torch.int32], ids=describe_dtype)
 @pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA"))
-@pytest.mark.parametrize("orderOut", ["col", "row"] if HIP_ENVIRONMENT else ["col", "row", "col32"], ids=id_formatter("orderOut"))
+@pytest.mark.parametrize(
+    "orderOut", ["col", "row"] if HIP_ENVIRONMENT else ["col", "row", "col32"], ids=id_formatter("orderOut")
+)
 @pytest.mark.parametrize("transpose", [False], ids=id_formatter("transpose"))
 @pytest.mark.parametrize("dims", [2, 3], ids=id_formatter("dims"))
 def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
@@ -2058,7 +2054,9 @@ def test_normal_map_tree():
         # print(pivots)
 
 
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64")
+@pytest.mark.skipif(
+    HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
+)
 @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
 @pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
 @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
diff --git a/tests/test_generation.py b/tests/test_generation.py
index 20490ea33..8e689261b 100644
--- a/tests/test_generation.py
+++ b/tests/test_generation.py
@@ -4,10 +4,8 @@
 import pytest
 import torch
 
-from tests.helpers import TRUE_FALSE, describe_dtype, id_formatter
-
-import bitsandbytes as bnb
 from bitsandbytes.cextension import HIP_ENVIRONMENT
+from tests.helpers import TRUE_FALSE, describe_dtype, id_formatter
 
 transformers = pytest.importorskip("transformers")
 
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index cef765dad..ca52f312e 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -8,8 +8,8 @@
 import bitsandbytes as bnb
 from bitsandbytes import functional as F
 from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout
-from bitsandbytes.nn.modules import Linear8bitLt
 from bitsandbytes.cextension import HIP_ENVIRONMENT
+from bitsandbytes.nn.modules import Linear8bitLt
 from tests.helpers import (
     TRUE_FALSE,
     id_formatter,
diff --git a/tests/test_optim.py b/tests/test_optim.py
index 362f037f1..d8c46e415 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -10,7 +10,6 @@
 
 import bitsandbytes as bnb
 import bitsandbytes.functional as F
-from bitsandbytes.cextension import HIP_ENVIRONMENT
 from tests.helpers import describe_dtype, id_formatter
 
 # import apex
diff --git a/tests/test_triton.py b/tests/test_triton.py
index 8d9e15c4d..1c5422c0d 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -1,9 +1,8 @@
 import pytest
 import torch
 
-from bitsandbytes.nn import Linear8bitLt
 from bitsandbytes.cextension import HIP_ENVIRONMENT
-
+from bitsandbytes.nn import Linear8bitLt
 from bitsandbytes.nn.triton_based_modules import SwitchBackLinear
 from bitsandbytes.triton.triton_utils import is_triton_available
 from tests.helpers import TRUE_FALSE

From 8c23dc0100e1d610cedb6ea13d6489d20690974f Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Tue, 9 Apr 2024 22:47:05 +0000
Subject: [PATCH 102/234] Fix typos

---
 bitsandbytes/cuda_setup/main.py | 2 +-
 csrc/kernels.hip                | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index b0a790e70..b2f9214a4 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -4,7 +4,7 @@
     [ ] TODO: Q - What if we have multiple GPUs of different makes?
 - CUDA version
 - Software:
-    - CPU-only: only CPU quantization functions (no optimizer, no matrix multipl)
+    - CPU-only: only CPU quantization functions (no optimizer, no matrix multiplication)
     - CuBLAS-LT: full-build 8-bit optimizer
     - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
 
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 6ff643a07..ca77dceda 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -113,7 +113,7 @@ __device__ unsigned char dQuantizeFP4(float x)
   // since we assume input data is in [-1.0, 1.0]
 
   // !be careful here, its easy to make a mistake
-  // that is difficult to noice if you add an extra
+  // that is difficult to notice if you add an extra
   // zero somewhere!
 
   int sign = x < 0 ? 0b1000 : 0b0000;
@@ -2118,7 +2118,7 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
   // 0. reset stats to -FLT_MAX
   // 1. load row-by-row ITEMS_PER_THREAD (TILE_SIZE==THREADS*ITEMS_PER_THREAD)
   // 2. compute col max (per thread); store in smem due to register pressure
-  // 3. compute row max (per block); store in smem to accumulate full global mem transation
+  // 3. compute row max (per block); store in smem to accumulate full global mem transaction
   // 4. store data via atomicMax
 
   // each block loads TILE_COLs columns and TILE_ROW rows
@@ -2207,7 +2207,7 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
       //smem_col_absmax_values[threadIdx.x + (j*THREADS)] = fmaxf(smem_col_absmax_values[threadIdx.x + (j*THREADS)], __half2float(local_data[j]));
       local_col_absmax_values[j] = fmaxf(local_col_absmax_values[j], __half2float(local_data[j]));
 
-    // 3. compute row max (per block); store in smem to accumulate full global mem transation
+    // 3. compute row max (per block); store in smem to accumulate full global mem transaction
 
     // this is slow as it uses extra registers, but we need this to be compatible with Kepler and Maxwell (no fp16 units)
     #pragma unroll ITEMS_PER_THREAD

From 4d6408a69c1a3ea7f4f1a9bde1fe89337612c39e Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Tue, 9 Apr 2024 22:16:13 -0500
Subject: [PATCH 103/234] Fix formatting in README file

---
 README.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9a741d22f..415679df9 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,9 @@ The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom fu
 The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module.
 
 **Installation for ROCm:**
+
 To install latest bitsandbytes (supported on ROCm 6.2):
+```bash
 git clone --recurse https://github.com/ROCm/bitsandbytes
 cd bitsandbytes
 git checkout rocm_enabled
@@ -15,16 +17,20 @@ pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=hip -S .
 make
 pip install .
+```
 
 For ROCm specific versions:
+
 Install Dependencies:
-#hipblaslt installation needed only for rocm<6.0
+```bash
+# hipblaslt installation needed only for rocm<6.0
 apt install hipblaslt
 pip install --upgrade pip
 pip install einops lion_pytorch accelerate
 pip install git+https://github.com/ROCm/transformers.git
-
+```
 Install Bitsandbytes:
+```bash
 git clone --recurse https://github.com/ROCm/bitsandbytes
 cd bitsandbytes
 # Checkout branch as needed
@@ -33,6 +39,7 @@ cd bitsandbytes
 git checkout <branch>
 make hip
 python setup.py install
+```
 
 **For more details, please head to the official documentation page:**
 

From d62516f290fb529a69bc2fda767b2a87bfd9d72f Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 10 Apr 2024 22:10:53 -0400
Subject: [PATCH 104/234] (backends) Stub out additional backends; move more
 functions to backend interface

---
 bitsandbytes/__init__.py      |  46 ++++-
 bitsandbytes/backends/base.py | 185 +++++++++++++++---
 bitsandbytes/backends/cpu.py  | 164 ++++++++++++++++
 bitsandbytes/backends/cuda.py | 343 +++++++++++++++++++++++++++++++++-
 bitsandbytes/backends/mps.py  | 164 ++++++++++++++++
 bitsandbytes/backends/rocm.py |  12 ++
 bitsandbytes/backends/xpu.py  | 164 ++++++++++++++++
 bitsandbytes/functional.py    | 303 +++++-------------------------
 8 files changed, 1085 insertions(+), 296 deletions(-)
 create mode 100644 bitsandbytes/backends/cpu.py
 create mode 100644 bitsandbytes/backends/mps.py
 create mode 100644 bitsandbytes/backends/rocm.py
 create mode 100644 bitsandbytes/backends/xpu.py

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 019a4f6ab..fcc31b220 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -3,6 +3,8 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+import torch
+
 from . import research, utils
 from .autograd._functions import (
     MatmulLtState,
@@ -12,15 +14,49 @@
     matmul_cublas,
     mm_cublas,
 )
+from .backends import register_backend
+from .backends.cpu import CPUBackend
 from .cextension import lib
 from .nn import modules
 
-if lib and lib.compiled_with_cuda:
-    from .backends import register_backend
-    from .backends.cuda import CUDABackend
-    from .optim import adam
+# Always register the CPU backend.
+register_backend("cpu", CPUBackend())
+
+# Register either CUDA or ROCm backend, if available.
+# Only one of these backends can be used at a time, since the torch.device semantics are
+# the same for both torch+rocm and torch+cuda (e.g. device name is "cuda")
+if torch.cuda.is_available():
+    # TODO: Consider deferring loading of cextension - should backend class implement that?
+
+    if torch.version.cuda:
+        from .backends.cuda import CUDABackend
+
+        register_backend("cuda", CUDABackend())
+    elif torch.version.hip:
+        from .backends.rocm import ROCmBackend
+
+        register_backend("cuda", ROCmBackend())
+
+# Register MPS backend, if available.
+if torch.backends.mps.is_available() and torch.backends.mps.is_built():
+    from .backends.mps import MPSBackend
+
+    register_backend("mps", MPSBackend())
+
+# Register Intel XPU backend, if available.
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+    from .backends.xpu import XPUBackend
+
+    register_backend("xpu", XPUBackend())
+
+# TODO: Other potential backends:
+# XLA - Google TPU / PJRT runtime
+# HPU - Habana / Intel Gaudi
+# IPU - Graphcore
+# NPU - Ascend
+# Note that we may not map 1:1 with a device type, e.g. SYCL, XLA
+# In this case, it will be up to each backend to dispatch as needed
 
-    register_backend("cuda", CUDABackend())
 __pdoc__ = {
     "libbitsandbytes": False,
     "optim.optimizer.Optimizer8bit": False,
diff --git a/bitsandbytes/backends/base.py b/bitsandbytes/backends/base.py
index 8232d17c1..2e73c3d6e 100644
--- a/bitsandbytes/backends/base.py
+++ b/bitsandbytes/backends/base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Optional, Tuple
+from typing import Literal, Optional, Tuple, Union
 
 import torch
 
@@ -12,11 +12,11 @@ class Backend(ABC):
     @abstractmethod
     def double_quant(
         self,
-        A,
-        col_stats=None,
-        row_stats=None,
-        out_col=None,
-        out_row=None,
+        A: torch.Tensor,
+        col_stats: Optional[torch.Tensor] = None,
+        row_stats: Optional[torch.Tensor] = None,
+        out_col: Optional[torch.Tensor] = None,
+        out_row: Optional[torch.Tensor] = None,
         threshold=0.0,
     ):
         raise NotImplementedError
@@ -24,36 +24,50 @@ def double_quant(
     @abstractmethod
     def transform(
         self,
-        A,
-        to_order,
+        A: torch.Tensor,
+        to_order: str,
         from_order="row",
-        out=None,
+        out: Optional[torch.Tensor] = None,
         transpose=False,
-        state=None,
+        state: Optional[Tuple[torch.Size, str]] = None,
         ld=None,
     ):
         raise NotImplementedError
 
     @abstractmethod
-    def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+    def igemmlt(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        SB: Tuple[torch.Size, str],
+        out: Optional[torch.Tensor] = None,
+        Sout: Optional[Tuple[torch.Size, str]] = None,
+        dtype=torch.int32,
+    ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
         raise NotImplementedError
 
     @abstractmethod
     def mm_dequant(
         self,
-        A,
-        quant_state,
-        row_stats,
-        col_stats,
-        out=None,
-        new_row_stats=None,
-        new_col_stats=None,
-        bias=None,
-    ):
+        A: torch.Tensor,
+        quant_state: Tuple[torch.Size, str],
+        row_stats: torch.Tensor,
+        col_stats: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        new_row_stats: Optional[torch.Tensor] = None,
+        new_col_stats: Optional[torch.Tensor] = None,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
         raise NotImplementedError
 
     @abstractmethod
-    def extract_outliers(self, A, SA, idx):
+    def extract_outliers(
+        self,
+        A: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        idx: torch.Tensor,
+    ) -> torch.Tensor:
         raise NotImplementedError
 
     @abstractmethod
@@ -64,7 +78,7 @@ def quantize_4bit(
         out: Optional[torch.Tensor] = None,
         blocksize=64,
         compress_statistics=False,
-        quant_type="fp4",
+        quant_type: Literal["fp4", "nf4"] = "fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
         """
@@ -102,7 +116,7 @@ def dequantize_4bit(
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
         blocksize: int = 64,
-        quant_type="fp4",
+        quant_type: Literal["fp4", "nf4"] = "fp4",
     ) -> torch.Tensor:
         """
         Dequantizes FP4 blockwise quantized values.
@@ -131,3 +145,128 @@ def dequantize_4bit(
             Dequantized tensor.
         """
         raise NotImplementedError
+
+    @abstractmethod
+    def gemv_4bit(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        transposed_A=False,
+        transposed_B=False,
+        state: QuantState = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @abstractmethod
+    def quantize_blockwise(
+        self,
+        A: torch.Tensor,
+        code: Optional[torch.Tensor] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=4096,
+        nested=False,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def dequantize_blockwise(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        code: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 4096,
+        nested=False,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @abstractmethod
+    def optimizer_update_8bit_blockwise(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        state2: Optional[torch.Tensor],
+        beta1: float,
+        beta2: float,
+        eps: float,
+        step: int,
+        lr: float,
+        qmap1: torch.Tensor,
+        qmap2: Optional[torch.Tensor],
+        absmax1: torch.Tensor,
+        absmax2: Optional[torch.Tensor],
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        skip_zeros=False,
+    ) -> None:
+        """
+        Performs an in-place optimizer update with one or two optimizer states.
+
+        Args:
+            optimizer_name (`str`): The name of the optimizer, e.g. `adam`
+            g (`torch.Tensor`): Gradient tensor.
+            p (`torch.Tensor`): Parameter tensor.
+            state1 (`torch.Tensor`): Optimizer state 1.
+            state2 (`torch.Tensor`, optional): Optimizer state 2.
+            beta1 (`float`): Optimizer beta1.
+            beta2 (`float`): Optimizer beta2.
+            eps (`float`): Optimizer epsilon.
+            step (`int`): Current optimizer step.
+            lr (`float`): The learning rate.
+            qmap1 (`torch.Tensor`): Quantization map for the first state.
+            qmap2 (`torch.Tensor`, optional): Quantization map for the second state.
+            absmax1 (`torch.Tensor`): Max value for the first state update.
+            absmax2 (`torch.Tensor`, optional): Max value for the second state update.
+            weight_decay (`float`, optional): Weight decay. Defaults to 0.0.
+            gnorm_scale (`float`, optional): The factor to rescale the gradient to the max clip value. Defaults to 1.0.
+            skip_zeros (`bool`, optional): Whether to skip zero-valued gradients or not. Defaults to False.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def optimizer_update_32bit(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        beta1: float,
+        eps: float,
+        step: int,
+        lr: float,
+        state2: Optional[torch.Tensor] = None,
+        beta2: float = 0.0,
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        unorm_vec: Optional[torch.Tensor] = None,
+        max_unorm: float = 0.0,
+        skip_zeros=False,
+    ) -> None:
+        """
+        Performs an in-place optimizer update with one or two optimizer states.
+
+        Universal optimizer update for 32-bit state and 32/16-bit gradients/weights
+
+        Args:
+            optimizer_name (`str`): The name of the optimizer, e.g. `adam`
+            g (`torch.Tensor`): Gradient tensor.
+            p (`torch.Tensor`): Parameter tensor.
+            state1 (`torch.Tensor`): Optimizer state 1.
+            beta1 (`float`): Optimizer beta1.
+            eps (`float`): Optimizer epsilon.
+            step (`int`): Current optimizer step.
+            lr (`float`): The learning rate.
+            state2 (`torch.Tensor`, optional): Optimizer state 2. Defaults to None.
+            beta2 (`float`, optional): Optimizer beta2. Defaults to 0.0.
+            weight_decay (`float`, optional): Defaults to 0.0.
+            gnorm_scale (`float`, optional): The factor to rescale the gradient to the max clip value. Defaults to 1.0.
+            unorm_vec (`torch.Tensor`, optional): The tensor for the update norm. Defaults to None.
+            max_unorm (`float`, optional): The maximum update norm relative to the weight norm.. Defaults to 0.0.
+            skip_zeros (`bool`, optional): Whether to skip zero-valued gradients or not. Defaults to False.
+        """
+        raise NotImplementedError
diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
new file mode 100644
index 000000000..830ebfadd
--- /dev/null
+++ b/bitsandbytes/backends/cpu.py
@@ -0,0 +1,164 @@
+from typing import Literal, Optional, Tuple, Union
+
+import torch
+
+from bitsandbytes.utils import QuantState
+
+from .base import Backend
+
+
+class CPUBackend(Backend):
+    def double_quant(
+        self,
+        A: torch.Tensor,
+        col_stats: Optional[torch.Tensor] = None,
+        row_stats: Optional[torch.Tensor] = None,
+        out_col: Optional[torch.Tensor] = None,
+        out_row: Optional[torch.Tensor] = None,
+        threshold=0.0,
+    ):
+        raise NotImplementedError
+
+    def transform(
+        self,
+        A: torch.Tensor,
+        to_order: str,
+        from_order="row",
+        out: Optional[torch.Tensor] = None,
+        transpose=False,
+        state: Optional[Tuple[torch.Size, str]] = None,
+        ld=None,
+    ):
+        raise NotImplementedError
+
+    def igemmlt(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        SB: Tuple[torch.Size, str],
+        out: Optional[torch.Tensor] = None,
+        Sout: Optional[Tuple[torch.Size, str]] = None,
+        dtype=torch.int32,
+    ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
+        raise NotImplementedError
+
+    def mm_dequant(
+        self,
+        A: torch.Tensor,
+        quant_state: Tuple[torch.Size, str],
+        row_stats: torch.Tensor,
+        col_stats: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        new_row_stats: Optional[torch.Tensor] = None,
+        new_col_stats: Optional[torch.Tensor] = None,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def extract_outliers(
+        self,
+        A: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        idx: torch.Tensor,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def quantize_4bit(
+        self,
+        A: torch.Tensor,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type: Literal["fp4", "nf4"] = "fp4",
+        quant_storage=torch.uint8,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    def dequantize_4bit(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 64,
+        quant_type: Literal["fp4", "nf4"] = "fp4",
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def gemv_4bit(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        transposed_A=False,
+        transposed_B=False,
+        state: QuantState = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def dequantize_blockwise(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        code: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 4096,
+        nested=False,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def quantize_blockwise(
+        self,
+        A: torch.Tensor,
+        code: Optional[torch.Tensor] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=4096,
+        nested=False,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    def optimizer_update_8bit_blockwise(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        state2: Optional[torch.Tensor],
+        beta1: float,
+        beta2: float,
+        eps: float,
+        step: int,
+        lr: float,
+        qmap1: torch.Tensor,
+        qmap2: Optional[torch.Tensor],
+        absmax1: torch.Tensor,
+        absmax2: Optional[torch.Tensor],
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError
+
+    def optimizer_update_32bit(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        beta1: float,
+        eps: float,
+        step: int,
+        lr: float,
+        state2: Optional[torch.Tensor] = None,
+        beta2: float = 0.0,
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        unorm_vec: Optional[torch.Tensor] = None,
+        max_unorm: float = 0.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index c76bcaebd..93755b05f 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -1,5 +1,5 @@
 import ctypes as ct
-from typing import Optional, Tuple
+from typing import Literal, Optional, Tuple
 
 import torch
 
@@ -23,9 +23,69 @@
 
 from .base import Backend
 
+if lib and lib.compiled_with_cuda:
+    """C FUNCTIONS FOR OPTIMIZERS"""
+    str2optimizer32bit = {
+        "adam": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
+            lib.cadam32bit_grad_bf16,
+        ),
+        "momentum": (
+            lib.cmomentum32bit_grad_32,
+            lib.cmomentum32bit_grad_16,
+        ),
+        "rmsprop": (
+            lib.crmsprop32bit_grad_32,
+            lib.crmsprop32bit_grad_16,
+        ),
+        "lion": (
+            lib.clion32bit_grad_fp32,
+            lib.clion32bit_grad_fp16,
+            lib.clion32bit_grad_bf16,
+        ),
+        "adagrad": (
+            lib.cadagrad32bit_grad_32,
+            lib.cadagrad32bit_grad_16,
+        ),
+    }
+
+    str2optimizer8bit_blockwise = {
+        "adam": (
+            lib.cadam_8bit_blockwise_grad_fp32,
+            lib.cadam_8bit_blockwise_grad_fp16,
+            lib.cadam_8bit_blockwise_grad_bf16,
+        ),
+        "momentum": (
+            lib.cmomentum_8bit_blockwise_grad_fp32,
+            lib.cmomentum_8bit_blockwise_grad_fp16,
+        ),
+        "rmsprop": (
+            lib.crmsprop_8bit_blockwise_grad_fp32,
+            lib.crmsprop_8bit_blockwise_grad_fp16,
+        ),
+        "lion": (
+            lib.clion_8bit_blockwise_grad_fp32,
+            lib.clion_8bit_blockwise_grad_fp16,
+            lib.clion_8bit_blockwise_grad_bf16,
+        ),
+        "adagrad": (
+            lib.cadagrad_8bit_blockwise_grad_fp32,
+            lib.cadagrad_8bit_blockwise_grad_fp16,
+        ),
+    }
+
 
 class CUDABackend(Backend):
-    def double_quant(self, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
+    def double_quant(
+        self,
+        A: torch.Tensor,
+        col_stats: Optional[torch.Tensor] = None,
+        row_stats: Optional[torch.Tensor] = None,
+        out_col: Optional[torch.Tensor] = None,
+        out_row: Optional[torch.Tensor] = None,
+        threshold=0.0,
+    ):
         device = A.device
         assert A.dtype == torch.half
         assert device.type == "cuda"
@@ -114,7 +174,16 @@ def double_quant(self, A, col_stats=None, row_stats=None, out_col=None, out_row=
 
         return out_row, out_col, row_stats, col_stats, coo_tensor
 
-    def transform(self, A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None):
+    def transform(
+        self,
+        A: torch.Tensor,
+        to_order: str,
+        from_order="row",
+        out: Optional[torch.Tensor] = None,
+        transpose=False,
+        state: Optional[Tuple[torch.Size, str]] = None,
+        ld=None,
+    ):
         prev_device = pre_call(A.device)
         if state is None:
             state = (A.shape, from_order)
@@ -166,7 +235,16 @@ def transform(self, A, to_order, from_order="row", out=None, transpose=False, st
 
         return out, new_state
 
-    def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+    def igemmlt(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        SB: Tuple[torch.Size, str],
+        out: Optional[torch.Tensor] = None,
+        Sout: Optional[Tuple[torch.Size, str]] = None,
+        dtype=torch.int32,
+    ):
         shapeA = SA[0]
         shapeB = SB[0]
         dimsA = len(shapeA)
@@ -260,7 +338,15 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         return out, Sout
 
     def mm_dequant(
-        self, A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None
+        self,
+        A: torch.Tensor,
+        quant_state: Tuple[torch.Size, str],
+        row_stats: torch.Tensor,
+        col_stats: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        new_row_stats: Optional[torch.Tensor] = None,
+        new_col_stats: Optional[torch.Tensor] = None,
+        bias: Optional[torch.Tensor] = None,
     ):
         assert A.dtype == torch.int32
         if bias is not None:
@@ -297,7 +383,7 @@ def mm_dequant(
 
         return out
 
-    def extract_outliers(self, A, SA, idx):
+    def extract_outliers(self, A: torch.Tensor, SA: Tuple[torch.Size, str], idx: torch.Tensor):
         shapeA = SA[0]
         formatA = SA[1]
         assert formatA in ["col_turing", "col_ampere"]
@@ -330,7 +416,7 @@ def quantize_4bit(
         out: Optional[torch.Tensor] = None,
         blocksize=64,
         compress_statistics=False,
-        quant_type="fp4",
+        quant_type: Literal["fp4", "nf4"] = "fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
         if A.device.type != "cuda":
@@ -395,7 +481,7 @@ def quantize_4bit(
         if compress_statistics:
             offset = absmax.mean()
             absmax -= offset
-            qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
+            qabsmax, state2 = self.quantize_blockwise(absmax, blocksize=256)
             del absmax
             state = QuantState(
                 absmax=qabsmax,
@@ -422,7 +508,7 @@ def dequantize_4bit(
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
         blocksize: int = 64,
-        quant_type="fp4",
+        quant_type: Literal["fp4", "nf4"] = "fp4",
     ) -> torch.Tensor:
         if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
             raise ValueError(
@@ -442,7 +528,7 @@ def dequantize_4bit(
             absmax = quant_state.absmax
 
         if quant_state.nested:
-            absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
+            absmax = self.dequantize_blockwise(quant_state.absmax, quant_state.state2)
             absmax += quant_state.offset
             if absmax.dtype != torch.float32:
                 absmax = absmax.float()
@@ -526,3 +612,240 @@ def dequantize_4bit(
             return out.t()
         else:
             return out
+
+    def gemv_4bit(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        transposed_A=False,
+        transposed_B=False,
+        state: QuantState = None,
+    ):
+        prev_device = pre_call(A.device)
+
+        if state is None:
+            raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
+
+        if A.numel() != A.shape[-1]:
+            raise ValueError(
+                'Dimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]',
+            )
+
+        Bshape = state.shape
+        bout = Bshape[0]
+        absmax = state.absmax
+        if state.nested:
+            absmax = self.dequantize_blockwise(state.absmax, state.state2)
+            absmax += state.offset
+
+        if out is None:
+            if len(A.shape) == 3:
+                out = torch.empty(size=(A.shape[0], A.shape[1], bout), dtype=A.dtype, device=A.device)
+            else:
+                out = torch.empty(size=(A.shape[0], bout), dtype=A.dtype, device=A.device)
+
+        n = 1
+        m = Bshape[0]
+        k = Bshape[1]
+        lda = Bshape[0]
+        ldc = Bshape[0]
+        ldb = (A.shape[-1] + 1) // 2
+        is_on_gpu([B, A, out, absmax, state.code])
+        m = ct.c_int32(m)
+        n = ct.c_int32(n)
+        k = ct.c_int32(k)
+        lda = ct.c_int32(lda)
+        ldb = ct.c_int32(ldb)
+        ldc = ct.c_int32(ldc)
+
+        inference_args = [
+            m,
+            n,
+            k,
+            get_ptr(A),
+            get_ptr(B),
+            get_ptr(absmax),
+            get_ptr(state.code),
+            get_ptr(out),
+            lda,
+            ldb,
+            ldc,
+            ct.c_int32(state.blocksize),
+        ]
+
+        if B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32]:
+            if A.dtype == torch.float16:
+                lib.cgemm_4bit_inference_naive_fp16(*inference_args)
+            elif A.dtype == torch.bfloat16:
+                lib.cgemm_4bit_inference_naive_bf16(*inference_args)
+            elif A.dtype == torch.float32:
+                lib.cgemm_4bit_inference_naive_fp32(*inference_args)
+            else:
+                raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}")
+
+        else:
+            raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}")
+
+        post_call(prev_device)
+
+        return out
+
+    def dequantize_blockwise(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        code: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 4096,
+        nested=False,
+    ) -> torch.Tensor:
+        # TODO: Move from bnb.functional
+        return dequantize_blockwise(
+            A,
+            quant_state=quant_state,
+            absmax=absmax,
+            code=code,
+            out=out,
+            blocksize=blocksize,
+            nested=nested,
+        )
+
+    def quantize_blockwise(
+        self,
+        A: torch.Tensor,
+        code: Optional[torch.Tensor] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=4096,
+        nested=False,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        # TODO: Move from bnb.functional
+        return quantize_blockwise(
+            A,
+            absmax=absmax,
+            code=code,
+            out=out,
+            blocksize=blocksize,
+            nested=nested,
+        )
+
+    def optimizer_update_8bit_blockwise(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        state2: Optional[torch.Tensor],
+        beta1: float,
+        beta2: float,
+        eps: float,
+        step: int,
+        lr: float,
+        qmap1: torch.Tensor,
+        qmap2: Optional[torch.Tensor],
+        absmax1: torch.Tensor,
+        absmax2: Optional[torch.Tensor],
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        skip_zeros=False,
+    ) -> None:
+        optim_func = None
+        prev_device = pre_call(g.device)
+        is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2])
+        if g.dtype == torch.float32 and state1.dtype == torch.uint8:
+            optim_func = str2optimizer8bit_blockwise[optimizer_name][0]
+        elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
+            optim_func = str2optimizer8bit_blockwise[optimizer_name][1]
+        elif (
+            g.dtype == torch.bfloat16
+            and state1.dtype == torch.uint8
+            and len(str2optimizer8bit_blockwise[optimizer_name]) == 3
+        ):
+            optim_func = str2optimizer8bit_blockwise[optimizer_name][2]
+        else:
+            raise ValueError(
+                f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}",
+            )
+        post_call(prev_device)
+
+        is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2])
+
+        prev_device = pre_call(g.device)
+        optim_func(
+            get_ptr(p),
+            get_ptr(g),
+            get_ptr(state1),
+            get_ptr(state2),
+            ct.c_float(beta1),
+            ct.c_float(beta2),
+            ct.c_float(eps),
+            ct.c_int32(step),
+            ct.c_float(lr),
+            get_ptr(qmap1),
+            get_ptr(qmap2),
+            get_ptr(absmax1),
+            get_ptr(absmax2),
+            ct.c_float(weight_decay),
+            ct.c_float(gnorm_scale),
+            ct.c_bool(skip_zeros),
+            ct.c_int32(g.numel()),
+        )
+        post_call(prev_device)
+
+    def optimizer_update_32bit(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        beta1: float,
+        eps: float,
+        step: int,
+        lr: float,
+        state2: Optional[torch.Tensor] = None,
+        beta2: float = 0.0,
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        unorm_vec: Optional[torch.Tensor] = None,
+        max_unorm: float = 0.0,
+        skip_zeros=False,
+    ) -> None:
+        param_norm = 0.0
+        if max_unorm > 0.0:
+            param_norm = torch.norm(p.data.float())
+
+        optim_func = None
+        if g.dtype == torch.float32:
+            optim_func = str2optimizer32bit[optimizer_name][0]
+        elif g.dtype == torch.float16:
+            optim_func = str2optimizer32bit[optimizer_name][1]
+        elif g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name]) == 3:
+            optim_func = str2optimizer32bit[optimizer_name][2]
+        else:
+            raise ValueError(
+                f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}",
+            )
+
+        is_on_gpu([g, p, state1, state2, unorm_vec])
+        prev_device = pre_call(g.device)
+        optim_func(
+            get_ptr(g),
+            get_ptr(p),
+            get_ptr(state1),
+            get_ptr(state2),
+            get_ptr(unorm_vec),
+            ct.c_float(max_unorm),
+            ct.c_float(param_norm),
+            ct.c_float(beta1),
+            ct.c_float(beta2),
+            ct.c_float(eps),
+            ct.c_float(weight_decay),
+            ct.c_int32(step),
+            ct.c_float(lr),
+            ct.c_float(gnorm_scale),
+            ct.c_bool(skip_zeros),
+            ct.c_int32(g.numel()),
+        )
+        post_call(prev_device)
diff --git a/bitsandbytes/backends/mps.py b/bitsandbytes/backends/mps.py
new file mode 100644
index 000000000..5b7eda0c7
--- /dev/null
+++ b/bitsandbytes/backends/mps.py
@@ -0,0 +1,164 @@
+from typing import Literal, Optional, Tuple, Union
+
+import torch
+
+from bitsandbytes.utils import QuantState
+
+from .base import Backend
+
+
+class MPSBackend(Backend):
+    def double_quant(
+        self,
+        A: torch.Tensor,
+        col_stats: Optional[torch.Tensor] = None,
+        row_stats: Optional[torch.Tensor] = None,
+        out_col: Optional[torch.Tensor] = None,
+        out_row: Optional[torch.Tensor] = None,
+        threshold=0.0,
+    ):
+        raise NotImplementedError
+
+    def transform(
+        self,
+        A: torch.Tensor,
+        to_order: str,
+        from_order="row",
+        out: Optional[torch.Tensor] = None,
+        transpose=False,
+        state: Optional[Tuple[torch.Size, str]] = None,
+        ld=None,
+    ):
+        raise NotImplementedError
+
+    def igemmlt(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        SB: Tuple[torch.Size, str],
+        out: Optional[torch.Tensor] = None,
+        Sout: Optional[Tuple[torch.Size, str]] = None,
+        dtype=torch.int32,
+    ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
+        raise NotImplementedError
+
+    def mm_dequant(
+        self,
+        A: torch.Tensor,
+        quant_state: Tuple[torch.Size, str],
+        row_stats: torch.Tensor,
+        col_stats: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        new_row_stats: Optional[torch.Tensor] = None,
+        new_col_stats: Optional[torch.Tensor] = None,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def extract_outliers(
+        self,
+        A: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        idx: torch.Tensor,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def quantize_4bit(
+        self,
+        A: torch.Tensor,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type: Literal["fp4", "nf4"] = "fp4",
+        quant_storage=torch.uint8,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    def dequantize_4bit(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 64,
+        quant_type: Literal["fp4", "nf4"] = "fp4",
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def gemv_4bit(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        transposed_A=False,
+        transposed_B=False,
+        state: QuantState = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def dequantize_blockwise(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        code: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 4096,
+        nested=False,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def quantize_blockwise(
+        self,
+        A: torch.Tensor,
+        code: Optional[torch.Tensor] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=4096,
+        nested=False,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    def optimizer_update_8bit_blockwise(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        state2: Optional[torch.Tensor],
+        beta1: float,
+        beta2: float,
+        eps: float,
+        step: int,
+        lr: float,
+        qmap1: torch.Tensor,
+        qmap2: Optional[torch.Tensor],
+        absmax1: torch.Tensor,
+        absmax2: Optional[torch.Tensor],
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError
+
+    def optimizer_update_32bit(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        beta1: float,
+        eps: float,
+        step: int,
+        lr: float,
+        state2: Optional[torch.Tensor] = None,
+        beta2: float = 0.0,
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        unorm_vec: Optional[torch.Tensor] = None,
+        max_unorm: float = 0.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError
diff --git a/bitsandbytes/backends/rocm.py b/bitsandbytes/backends/rocm.py
new file mode 100644
index 000000000..d74f10ead
--- /dev/null
+++ b/bitsandbytes/backends/rocm.py
@@ -0,0 +1,12 @@
+from .cuda import CUDABackend
+
+
+class ROCmBackend(CUDABackend):
+    """
+    Backend for AMD ROCm implementation.
+
+    The interface is largely the same as the CUDA implementation, so only any
+    differences need to be implemented here.
+    """
+
+    pass
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
new file mode 100644
index 000000000..3976c4d5a
--- /dev/null
+++ b/bitsandbytes/backends/xpu.py
@@ -0,0 +1,164 @@
+from typing import Literal, Optional, Tuple, Union
+
+import torch
+
+from bitsandbytes.utils import QuantState
+
+from .base import Backend
+
+
+class XPUBackend(Backend):
+    def double_quant(
+        self,
+        A: torch.Tensor,
+        col_stats: Optional[torch.Tensor] = None,
+        row_stats: Optional[torch.Tensor] = None,
+        out_col: Optional[torch.Tensor] = None,
+        out_row: Optional[torch.Tensor] = None,
+        threshold=0.0,
+    ):
+        raise NotImplementedError
+
+    def transform(
+        self,
+        A: torch.Tensor,
+        to_order: str,
+        from_order="row",
+        out: Optional[torch.Tensor] = None,
+        transpose=False,
+        state: Optional[Tuple[torch.Size, str]] = None,
+        ld=None,
+    ):
+        raise NotImplementedError
+
+    def igemmlt(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        SB: Tuple[torch.Size, str],
+        out: Optional[torch.Tensor] = None,
+        Sout: Optional[Tuple[torch.Size, str]] = None,
+        dtype=torch.int32,
+    ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
+        raise NotImplementedError
+
+    def mm_dequant(
+        self,
+        A: torch.Tensor,
+        quant_state: Tuple[torch.Size, str],
+        row_stats: torch.Tensor,
+        col_stats: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        new_row_stats: Optional[torch.Tensor] = None,
+        new_col_stats: Optional[torch.Tensor] = None,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def extract_outliers(
+        self,
+        A: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        idx: torch.Tensor,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def quantize_4bit(
+        self,
+        A: torch.Tensor,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type: Literal["fp4", "nf4"] = "fp4",
+        quant_storage=torch.uint8,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    def dequantize_4bit(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 64,
+        quant_type: Literal["fp4", "nf4"] = "fp4",
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def gemv_4bit(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        transposed_A=False,
+        transposed_B=False,
+        state: QuantState = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def dequantize_blockwise(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        code: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 4096,
+        nested=False,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def quantize_blockwise(
+        self,
+        A: torch.Tensor,
+        code: Optional[torch.Tensor] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=4096,
+        nested=False,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    def optimizer_update_8bit_blockwise(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        state2: Optional[torch.Tensor],
+        beta1: float,
+        beta2: float,
+        eps: float,
+        step: int,
+        lr: float,
+        qmap1: torch.Tensor,
+        qmap2: Optional[torch.Tensor],
+        absmax1: torch.Tensor,
+        absmax2: Optional[torch.Tensor],
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError
+
+    def optimizer_update_32bit(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        beta1: float,
+        eps: float,
+        step: int,
+        lr: float,
+        state2: Optional[torch.Tensor] = None,
+        beta2: float = 0.0,
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        unorm_vec: Optional[torch.Tensor] = None,
+        max_unorm: float = 0.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 6bb02944d..a180cf0ce 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -27,31 +27,6 @@ def prod(iterable):
 
 if lib and lib.compiled_with_cuda:
     """C FUNCTIONS FOR OPTIMIZERS"""
-    str2optimizer32bit = {
-        "adam": (
-            lib.cadam32bit_grad_fp32,
-            lib.cadam32bit_grad_fp16,
-            lib.cadam32bit_grad_bf16,
-        ),
-        "momentum": (
-            lib.cmomentum32bit_grad_32,
-            lib.cmomentum32bit_grad_16,
-        ),
-        "rmsprop": (
-            lib.crmsprop32bit_grad_32,
-            lib.crmsprop32bit_grad_16,
-        ),
-        "lion": (
-            lib.clion32bit_grad_fp32,
-            lib.clion32bit_grad_fp16,
-            lib.clion32bit_grad_bf16,
-        ),
-        "adagrad": (
-            lib.cadagrad32bit_grad_32,
-            lib.cadagrad32bit_grad_16,
-        ),
-    }
-
     str2optimizer8bit = {
         "adam": (
             lib.cadam_static_8bit_grad_32,
@@ -79,31 +54,6 @@ def prod(iterable):
         ),
     }
 
-    str2optimizer8bit_blockwise = {
-        "adam": (
-            lib.cadam_8bit_blockwise_grad_fp32,
-            lib.cadam_8bit_blockwise_grad_fp16,
-            lib.cadam_8bit_blockwise_grad_bf16,
-        ),
-        "momentum": (
-            lib.cmomentum_8bit_blockwise_grad_fp32,
-            lib.cmomentum_8bit_blockwise_grad_fp16,
-        ),
-        "rmsprop": (
-            lib.crmsprop_8bit_blockwise_grad_fp32,
-            lib.crmsprop_8bit_blockwise_grad_fp16,
-        ),
-        "lion": (
-            lib.clion_8bit_blockwise_grad_fp32,
-            lib.clion_8bit_blockwise_grad_fp16,
-            lib.clion_8bit_blockwise_grad_bf16,
-        ),
-        "adagrad": (
-            lib.cadagrad_8bit_blockwise_grad_fp32,
-            lib.cadagrad_8bit_blockwise_grad_fp16,
-        ),
-    }
-
 
 class GlobalPageManager:
     _instance = None
@@ -1167,82 +1117,24 @@ def optimizer_update_32bit(
     max_unorm: float = 0.0,
     skip_zeros=False,
 ) -> None:
-    """
-    Performs an inplace optimizer update with one or two optimizer states.
-
-    Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.
-
-    Parameters
-    ----------
-    optimizer_name : str
-        The name of the optimizer: {adam}.
-    g : torch.Tensor
-        Gradient tensor.
-    p : torch.Tensor
-        Parameter tensor.
-    state1 : torch.Tensor
-        Optimizer state 1.
-    beta1 : float
-        Optimizer beta1.
-    eps : float
-        Optimizer epsilon.
-    weight_decay : float
-        Weight decay.
-    step : int
-        Current optimizer step.
-    lr : float
-        The learning rate.
-    state2 : torch.Tensor
-        Optimizer state 2.
-    beta2 : float
-        Optimizer beta2.
-    gnorm_scale : float
-        The factor to rescale the gradient to the max clip value.
-    unorm_vec : torch.Tensor
-        The tensor for the update norm.
-    max_unorm : float
-        The maximum update norm relative to the weight norm.
-    skip_zeros : bool
-        Whether to skip zero-valued gradients or not (default: False).
-    """
-
-    param_norm = 0.0
-    if max_unorm > 0.0:
-        param_norm = torch.norm(p.data.float())
-
-    optim_func = None
-    if g.dtype == torch.float32:
-        optim_func = str2optimizer32bit[optimizer_name][0]
-    elif g.dtype == torch.float16:
-        optim_func = str2optimizer32bit[optimizer_name][1]
-    elif g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name]) == 3:
-        optim_func = str2optimizer32bit[optimizer_name][2]
-    else:
-        raise ValueError(
-            f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}",
-        )
-
-    is_on_gpu([g, p, state1, state2, unorm_vec])
-    prev_device = pre_call(g.device)
-    optim_func(
-        get_ptr(g),
-        get_ptr(p),
-        get_ptr(state1),
-        get_ptr(state2),
-        get_ptr(unorm_vec),
-        ct.c_float(max_unorm),
-        ct.c_float(param_norm),
-        ct.c_float(beta1),
-        ct.c_float(beta2),
-        ct.c_float(eps),
-        ct.c_float(weight_decay),
-        ct.c_int32(step),
-        ct.c_float(lr),
-        ct.c_float(gnorm_scale),
-        ct.c_bool(skip_zeros),
-        ct.c_int32(g.numel()),
+    ensure_backend_is_available(g.device.type)
+    return backends[g.device.type].optimizer_update_32bit(
+        optimizer_name=optimizer_name,
+        g=g,
+        p=p,
+        state1=state1,
+        beta1=beta1,
+        eps=eps,
+        step=step,
+        lr=lr,
+        state2=state2,
+        beta2=beta2,
+        weight_decay=weight_decay,
+        gnorm_scale=gnorm_scale,
+        unorm_vec=unorm_vec,
+        max_unorm=max_unorm,
+        skip_zeros=skip_zeros,
     )
-    post_call(prev_device)
 
 
 def optimizer_update_8bit(
@@ -1397,48 +1289,26 @@ def optimizer_update_8bit_blockwise(
     gnorm_scale: float = 1.0,
     skip_zeros=False,
 ) -> None:
-    optim_func = None
-    prev_device = pre_call(g.device)
-    is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2])
-    if g.dtype == torch.float32 and state1.dtype == torch.uint8:
-        optim_func = str2optimizer8bit_blockwise[optimizer_name][0]
-    elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
-        optim_func = str2optimizer8bit_blockwise[optimizer_name][1]
-    elif (
-        g.dtype == torch.bfloat16
-        and state1.dtype == torch.uint8
-        and len(str2optimizer8bit_blockwise[optimizer_name]) == 3
-    ):
-        optim_func = str2optimizer8bit_blockwise[optimizer_name][2]
-    else:
-        raise ValueError(
-            f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}",
-        )
-    post_call(prev_device)
-
-    is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2])
-
-    prev_device = pre_call(g.device)
-    optim_func(
-        get_ptr(p),
-        get_ptr(g),
-        get_ptr(state1),
-        get_ptr(state2),
-        ct.c_float(beta1),
-        ct.c_float(beta2),
-        ct.c_float(eps),
-        ct.c_int32(step),
-        ct.c_float(lr),
-        get_ptr(qmap1),
-        get_ptr(qmap2),
-        get_ptr(absmax1),
-        get_ptr(absmax2),
-        ct.c_float(weight_decay),
-        ct.c_float(gnorm_scale),
-        ct.c_bool(skip_zeros),
-        ct.c_int32(g.numel()),
+    ensure_backend_is_available(g.device.type)
+    return backends[g.device.type].optimizer_update_8bit_blockwise(
+        optimizer_name=optimizer_name,
+        g=g,
+        p=p,
+        state1=state1,
+        state2=state2,
+        beta1=beta1,
+        beta2=beta2,
+        eps=eps,
+        step=step,
+        lr=lr,
+        qmap1=qmap1,
+        qmap2=qmap2,
+        absmax1=absmax1,
+        absmax2=absmax2,
+        weight_decay=weight_decay,
+        gnorm_scale=gnorm_scale,
+        skip_zeros=skip_zeros,
     )
-    post_call(prev_device)
 
 
 def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5):
@@ -1593,98 +1463,15 @@ def gemv_4bit(
     transposed_B=False,
     state=None,
 ):
-    prev_device = pre_call(A.device)
-    # sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype)
-    if state is None:
-        raise ValueError("state cannot None. gem_4bit( ) requires the state from quantize_4bit( )")
-
-    if A.numel() != A.shape[-1]:
-        raise ValueError(
-            'Dimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]',
-        )
-
-    Bshape = state.shape
-    bout = Bshape[0]
-    absmax = state.absmax
-    if state.nested:
-        absmax = dequantize_blockwise(state.absmax, state.state2)
-        absmax += state.offset
-
-    if out is None:
-        if len(A.shape) == 3:
-            out = torch.empty(size=(A.shape[0], A.shape[1], bout), dtype=A.dtype, device=A.device)
-        else:
-            out = torch.empty(size=(A.shape[0], bout), dtype=A.dtype, device=A.device)
-
-    n = 1
-    m = Bshape[0]
-    k = Bshape[1]
-    lda = Bshape[0]
-    ldc = Bshape[0]
-    ldb = (A.shape[-1] + 1) // 2
-    is_on_gpu([B, A, out, absmax, state.code])
-    m = ct.c_int32(m)
-    n = ct.c_int32(n)
-    k = ct.c_int32(k)
-    lda = ct.c_int32(lda)
-    ldb = ct.c_int32(ldb)
-    ldc = ct.c_int32(ldc)
-
-    if B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32]:
-        if A.dtype == torch.float16:
-            lib.cgemm_4bit_inference_naive_fp16(
-                m,
-                n,
-                k,
-                get_ptr(A),
-                get_ptr(B),
-                get_ptr(absmax),
-                get_ptr(state.code),
-                get_ptr(out),
-                lda,
-                ldb,
-                ldc,
-                ct.c_int32(state.blocksize),
-            )
-        elif A.dtype == torch.bfloat16:
-            lib.cgemm_4bit_inference_naive_bf16(
-                m,
-                n,
-                k,
-                get_ptr(A),
-                get_ptr(B),
-                get_ptr(absmax),
-                get_ptr(state.code),
-                get_ptr(out),
-                lda,
-                ldb,
-                ldc,
-                ct.c_int32(state.blocksize),
-            )
-        elif A.dtype == torch.float32:
-            lib.cgemm_4bit_inference_naive_fp32(
-                m,
-                n,
-                k,
-                get_ptr(A),
-                get_ptr(B),
-                get_ptr(absmax),
-                get_ptr(state.code),
-                get_ptr(out),
-                lda,
-                ldb,
-                ldc,
-                ct.c_int32(state.blocksize),
-            )
-        else:
-            raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}")
-
-    else:
-        raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}")
-
-    post_call(prev_device)
-
-    return out
+    ensure_backend_is_available(A.device.type)
+    return backends[A.device.type].gemv_4bit(
+        A,
+        B,
+        out=out,
+        transposed_A=transposed_A,
+        transposed_B=transposed_B,
+        state=state,
+    )
 
 
 def igemm(

From 13ad630ccce253ea805b70dd712000787f5b9f4f Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Thu, 11 Apr 2024 05:20:56 -0700
Subject: [PATCH 105/234] Add int8 ops for Intel CPU & XPU

---
 bitsandbytes/__init__.py                   |  12 +-
 bitsandbytes/autograd/_functions.py        |  13 +-
 bitsandbytes/backends/cpu.py               | 287 +++++++++++++++++++++
 bitsandbytes/backends/xpu.py               | 118 +++++++++
 bitsandbytes/functional.py                 |   5 +-
 bitsandbytes/nn/modules.py                 |  38 +++
 examples/int8_inference_huggingface_cpu.py |  32 +++
 7 files changed, 499 insertions(+), 6 deletions(-)
 create mode 100644 bitsandbytes/backends/cpu.py
 create mode 100644 bitsandbytes/backends/xpu.py
 create mode 100644 examples/int8_inference_huggingface_cpu.py

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 019a4f6ab..0dae37e8d 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+import torch
 from . import research, utils
 from .autograd._functions import (
     MatmulLtState,
@@ -14,13 +15,22 @@
 )
 from .cextension import lib
 from .nn import modules
+from .backends import register_backend
 
 if lib and lib.compiled_with_cuda:
-    from .backends import register_backend
     from .backends.cuda import CUDABackend
     from .optim import adam
 
     register_backend("cuda", CUDABackend())
+
+elif torch.xpu.is_available():
+    from .backends.xpu import XPUBackend
+    register_backend("xpu", XPUBackend)
+
+else:
+    from .backends.cpu import CPUBackend
+    register_backend("cpu", CPUBackend)
+
 __pdoc__ = {
     "libbitsandbytes": False,
     "optim.optimizer.Optimizer8bit": False,
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index e9821cd36..67b8b6b87 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -217,6 +217,8 @@ def backward(ctx, grad_output):
 
 def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""
+    if device == torch.device('cpu'):
+        return True
     if torch.cuda.get_device_capability(device=device) < (7, 5):
         return False
     device_name = torch.cuda.get_device_name(device=device)
@@ -312,13 +314,16 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
             state.outlier_pool = GlobalOutlierPooler.get_instance()
 
         # Cast A to fp16
-        if A.dtype != torch.float16:
-            warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
+        A_dtype = torch.float16
+        if A.device == torch.device('cpu'):
+            A_dtype = torch.bfloat16
+        if A.dtype != A_dtype:
+            warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to {A_dtype} during quantization")
 
         # 1. Quantize A
         if len(A.shape) == 3:
             A = A.reshape(-1, A.shape[-1])
-        CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(torch.float16), threshold=state.threshold)
+        CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(A_dtype), threshold=state.threshold)
 
         if state.threshold > 0.0 and coo_tensorA is not None:
             if state.has_fp16_weights:
@@ -393,7 +398,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
         if using_igemmlt:
             C32A, SA = F.transform(CA, "col32")
             out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
-            if bias is None or bias.dtype == torch.float16:
+            if bias is None or bias.dtype == A_dtype:
                 # we apply the fused bias here
                 output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias)
                 output = output.to(A.dtype)
diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
new file mode 100644
index 000000000..31bb52945
--- /dev/null
+++ b/bitsandbytes/backends/cpu.py
@@ -0,0 +1,287 @@
+import torch
+
+
+Tensor = torch.Tensor
+
+
+def assert_on_cpu(tensors):
+    on_cpu = True
+    for t in tensors:
+        if t is None: continue # NULL pointers are fine
+        on_cpu &= (t.device.type == 'cpu')
+    if not on_cpu:
+        raise TypeError(
+            'All input tensors need to be on CPU, but found some tensors to not be on CPU:\n' \
+            f' {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}'
+        )
+    return on_cpu
+
+
+@torch.compile(dynamic=True, options={"fx_graph_cache": True})
+def double_quant_common(
+    A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
+):
+    """
+    Find absolute max valus of each row/column of a tensor, and symmetrically quantize it to int8.
+    If threshold > 0.0, only values <= threshold are counted. All outliers are zeroed out in
+    the original tensor and they are kept in COO format: (rows, cols, valus)
+    If threashold == 0.0, there are no outliers.
+    Args:
+        A The tensor to be analyzed and quantized.
+        col_stats Absolute max values of each column of A. If it is not None, use the values directly.
+            Otherwise, find the values.
+        row_stats Absolute max values of each row of A. If it is not None, use the values directly.
+            Otherwise, find the values.
+        out_col Output buffer for the result quantized per column if it is not None
+        out_row Output buffer for the result quantized per row if it is not None
+        threshold The threshold for finding outliers if it is > 0.0. Otherwise it has no effect.
+    Return:
+        A tuple of output quantized per row, output quantized per column, absolute max values of
+        each row of A, absolute max values of each column of A, outliers in COO format
+    """
+    from ..functional import COOSparseTensor
+    cols = A.shape[-1]
+    if len(A.shape) == 3:
+        rows = A.shape[0] * A.shape[1]
+    else:
+        assert A.dim() == 2, f"double_quant: Input tensor should be 2d or 3d but got {A.dim()}d"
+        rows = A.shape[0]
+    A = A.reshape(rows, cols)
+
+    coo_tensor = None
+
+    def get_row_col_stats(A):
+        row_stats = torch.max(torch.abs(A), 1).values # absolute max of each row
+        col_stats = torch.max(torch.abs(A), 0).values # absolute max of each col
+        return row_stats, col_stats
+
+    def quant_to_int8(A, stats):
+        return torch.clamp(torch.round(A / stats * 127).to(torch.int8), -128, 127)
+
+    if threshold == 0.0:
+        if row_stats is None or col_stats is None:
+            row_stats, col_stats = get_row_col_stats(A)
+    else:
+        outlier_indices = torch.abs(A) > threshold # find outliers
+        outlier_coord = outlier_indices.nonzero() # get outlier coordinates
+        outlier_rows = outlier_coord[:, 0] # outlier row for COO sparse tensor
+        outlier_cols = outlier_coord[:, 1] # outlier column for COO sparse tensor
+        outlier_values = A[outlier_indices] # outlier values for COO sparse tensor
+        coo_tensor = COOSparseTensor(
+            A.shape[0], A.shape[1], outlier_values.numel(), outlier_rows.int(), outlier_cols.int(), outlier_values
+        )
+        if row_stats is None or col_stats is None:
+            A[outlier_indices] = 0 # zero out outliers
+            row_stats, col_stats = get_row_col_stats(A)
+            A[outlier_indices] = outlier_values # restore outliers for later use
+
+    quant_by_row = quant_to_int8(A, row_stats.unsqueeze(-1))
+    quant_by_col = quant_to_int8(A, col_stats.unsqueeze(0))
+    if out_row is not None:
+        out_row.copy_(quant_by_row)
+    else:
+        out_row = quant_by_row
+    if out_col is not None:
+        out_col.copy_(quant_by_col)
+    else:
+        out_col = quant_by_col
+    return out_row, out_col, row_stats, col_stats, coo_tensor
+
+
+def igemmlt_common(
+    A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32
+):
+    """
+    Do GEMMM computation. Data type: int8 * int8 -> int32.
+    Args:
+        A Activation of linear, data type is int8
+        B Weight of linear, data type is int8
+        SA Not used for CPU/XPU
+        SB Not used for CPU/XPU
+        out Specified output tensor if it is not None
+        Sout Not used for CPU/XPU but returned as is
+        dtype Data type of output
+    Return:
+        A tuple of GEMM result in dtype and Sout
+    """
+    assert A.dtype == torch.int8
+    assert B.dtype == torch.int8
+    if out is not None:
+        assert out.dtype == dtype
+
+    dimsA = A.ndim
+    dimsB = B.ndim
+    shapeA = A.shape
+    shapeB = B.shape
+    assert dimsA in [2, 3], 'Only two or three dimensional matrices are supported for argument A'
+    assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
+
+    if dimsA == 2:
+        m = shapeA[0]
+    elif dimsA == 3:
+        m = shapeA[0] * shapeA[1]
+    n = shapeB[0]
+    k = shapeA[-1]
+    assert shapeA[-1] == shapeB[-1], f'Shapes of A and B do not match, got {shapeA} and {shapeB}'
+    shapeOut = (shapeA[0], shapeA[1], n) if dimsA == 3 else (m, n)
+
+    # if the tensor is empty, return a transformed empty tensor with the right dimensions
+    if shapeA[0] == 0 and dimsA == 2:
+        return torch.empty((0, n), device=A.device, dtype=A.dtype)
+    elif shapeA[1] == 0 and dimsA == 3:
+        return torch.empty(tuple(shapeA[:2] + [n]), device=A.device, dtype=A.dtype)
+
+    A_reshaped = A.reshape(m, k)
+
+    if assert_on_cpu([A_reshaped, B]):
+        C = torch._int_mm(A_reshaped, B.T).to(dtype)
+    else:
+        C = torch.nn.functional.linear(A_reshaped, B).to(dtype)
+    if C.ndim != dimsA:
+        C = C.reshape(shapeOut)
+    if out is not None:
+        out.copy_(C)
+    else:
+        out = C
+
+    return out, Sout
+
+
+@torch.compile(dynamic=True, options={"fx_graph_cache": True})
+def mm_dequant_common(
+    A,
+    quant_state,
+    row_stats,
+    col_stats,
+    out=None,
+    new_row_stats=None,
+    new_col_stats=None,
+    bias=None,
+    compute_dtype=torch.float32,
+    output_dtype=torch.float32
+):
+    """
+    Dequant and add bias
+    out = A_int32 * (scale_A, scale_B) / 127 * 127 + bias
+    Args:
+        A The output of int8 gemm, whose dtype is int32
+        quant_state Not used for CPU
+        row_stats Absolute max value of each row of input (A) of gemm
+        col_stats Absolute max value of each row of weight (B) of gemm
+        out Output buffer
+        new_row_stats Not used for CPU/XPU
+        new_col_stats Not used for CPU/XPU
+        bias Bias of linear
+        compute_dtype Data type for computation
+        output_dtype Data type for output
+    Return:
+        The result
+    """
+    assert A.dtype == torch.int32
+    out_shape = A.shape
+    if len(out_shape) == 3:
+        out_shape = (out_shape[0] * out_shape[1], out_shape[2])
+
+    A_reshaped = A.reshape(out_shape).to(compute_dtype)
+    row_stats = row_stats.reshape(-1).unsqueeze(-1).to(compute_dtype)
+    col_stats = col_stats.reshape(-1).unsqueeze(0).to(compute_dtype)
+    out = A_reshaped * row_stats * col_stats / (127 * 127)
+    if bias is not None:
+        out = out + bias.to(compute_dtype)
+    out = out.to(output_dtype)
+    return out
+
+
+class CPUBackend:
+    mm_dequant_compute_dtype = torch.bfloat16
+    mm_dequant_output_dtype = torch.bfloat16
+
+    @classmethod
+    def double_quant(
+        cls, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
+    ):
+        assert_on_cpu([A, col_stats, row_stats, out_col, out_row])
+        return double_quant_common(A, col_stats, row_stats, out_col, out_row)
+
+    @classmethod
+    def transform(cls, A, to_order=None, from_order='row', out=None, transpose=False, state=None, ld=None):
+        """
+        Transform tensor A to to_order. It is originally designed for CUDA.
+        For CPU, it returns the original tensor if transpose=False.
+        Otherwise, it returns the transpose of A
+        """
+        assert_on_cpu([A, out])
+        if transpose:
+            if out is not None:
+                out.copy_(A.T)
+            else:
+                out = A.T
+        else:
+            if out is not None:
+                out.copy_(A)
+            else:
+                out = A
+        return out, state
+
+    @classmethod
+    def igemmlt(cls, A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32):
+        assert_on_cpu([A, B])
+        return igemmlt_common(A, B, SA, SB, out, Sout, dtype)
+
+    @classmethod
+    def mm_dequant(
+        cls,
+        A,
+        quant_state,
+        row_stats,
+        col_stats,
+        out=None,
+        new_row_stats=None,
+        new_col_stats=None,
+        bias=None
+    ):
+        assert_on_cpu([A, row_stats, col_stats, out, bias])
+        return mm_dequant_common(
+            A,
+            quant_state,
+            row_stats,
+            col_stats,
+            out,
+            new_row_stats,
+            new_col_stats,
+            bias,
+            cls.mm_dequant_compute_dtype,
+            cls.mm_dequant_output_dtype
+        )
+
+    @classmethod
+    def extract_outliers(cls, A, SA, idx):
+        """
+        Extract columns of A by idx
+        """
+        assert_on_cpu([A])
+        return A[:, idx].contiguous()
+
+    @classmethod
+    def quantize_4bit(
+        cls,
+        A: Tensor,
+        absmax: Tensor = None,
+        out: Tensor = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type="fp4",
+    ) -> Tensor:
+        assert False, "quantize_4bit not yet implemented for CPU backend"
+
+    @classmethod
+    def dequantize_4bit(
+        cls,
+        A: Tensor,
+        quant_state = None,
+        absmax: Tensor = None,
+        out: Tensor = None,
+        blocksize: int = 64,
+        quant_type="fp4",
+    ) -> Tensor:
+        assert False, "dequantize_4bit not yet implemented for CPU backend"
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
new file mode 100644
index 000000000..9ee8a09dc
--- /dev/null
+++ b/bitsandbytes/backends/xpu.py
@@ -0,0 +1,118 @@
+# For Intel GPU (xpu is the device name for Intel GPU in PyTorch)
+import torch
+from .cpu import (
+    double_quant_common,
+    igemmlt_common,
+    mm_dequant_common,
+)
+
+Tensor = torch.Tensor
+
+def assert_on_xpu(tensors):
+    on_xpu = True
+    for t in tensors:
+        if t is None: continue # NULL pointers are fine
+        on_xpu &= (t.device.type == 'xpu')
+    if not on_xpu:
+        raise TypeError(
+            'All input tensors need to be on XPU, but found some tensors to not be on XPU:\n' \
+            f' {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}'
+        )
+    return on_xpu
+
+
+class XPUBackend:
+    mm_dequant_compute_dtype = torch.half
+    mm_dequant_output_dtype = torch.half
+
+    @classmethod
+    @torch.compile(dynamic=True, options={"fx_graph_cache": True})
+    def double_quant(
+        cls, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
+    ):
+        assert_on_xpu([A, col_stats, row_stats, out_col, out_row])
+        return double_quant_common(A, col_stats, row_stats, out_col, out_row)
+
+    @classmethod
+    def transform(cls, A, to_order=None, from_order='row', out=None, transpose=False, state=None, ld=None):
+        """
+        Transform tensor A to to_order. It is originally designed for CUDA.
+        For XPU, it returns the original tensor if transpose=False.
+        Otherwise, it returns the transpose of A
+        """
+        assert_on_xpu([A, out])
+        if transpose:
+            if out is not None:
+                out.copy_(A.T)
+            else:
+                out = A.T
+        else:
+            if out is not None:
+                out.copy_(A)
+            else:
+                out = A
+        return out, state
+
+    @classmethod
+    def igemmlt(cls, A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32):
+        assert_on_xpu([A, B])
+        return igemmlt_common(A, B, SA, SB, out, Sout, dtype)
+
+    @classmethod
+    @torch.compile(dynamic=True, options={"fx_graph_cache": True})
+    def mm_dequant(
+        cls,
+        A,
+        quant_state,
+        row_stats,
+        col_stats,
+        out=None,
+        new_row_stats=None,
+        new_col_stats=None,
+        bias=None
+    ):
+        assert_on_xpu([A, row_stats, col_stats, out, bias])
+        return mm_dequant_common(
+            A,
+            quant_state,
+            row_stats,
+            col_stats,
+            out,
+            new_row_stats,
+            new_col_stats,
+            bias,
+            cls.mm_dequant_compute_dtype,
+            cls.mm_dequant_output_dtype
+        )
+
+    @classmethod
+    def extract_outliers(cls, A, SA, idx):
+        """
+        Extract columns of A by idx
+        """
+        assert_on_xpu([A])
+        return A[:, idx].contiguous()
+
+    @classmethod
+    def quantize_4bit(
+        cls,
+        A: Tensor,
+        absmax: Tensor = None,
+        out: Tensor = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type="fp4",
+    ) -> Tensor:
+        assert False, "quantize_4bit not yet implemented for XPU backend"
+
+    @classmethod
+    def dequantize_4bit(
+        cls,
+        A: Tensor,
+        quant_state = None,
+        absmax: Tensor = None,
+        out: Tensor = None,
+        blocksize: int = 64,
+        quant_type="fp4",
+    ) -> Tensor:
+        assert False, "dequantize_4bit not yet implemented for XPU backend"
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 6bb02944d..baba76963 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1945,7 +1945,10 @@ class COOSparseTensor:
     def __init__(self, rows, cols, nnz, rowidx, colidx, values):
         assert rowidx.dtype == torch.int32
         assert colidx.dtype == torch.int32
-        assert values.dtype == torch.float16
+        if values.device == torch.device('cpu'):
+            assert values.dtype in [torch.bfloat16, torch.float]
+        else:
+            assert values.dtype == torch.float16
         assert values.numel() == nnz
         assert rowidx.numel() == nnz
         assert colidx.numel() == nnz
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index ec14e5940..bcba8b3d2 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -581,6 +581,32 @@ def cuda(self, device):
 
         return self
 
+    def cpu(self):
+        # we store the 8-bit rows-major weight
+        B = self.data.contiguous().bfloat16().cpu()
+        CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
+        if CBt is not None:
+            del CBt
+        if SCBt is not None:
+            del SCBt
+        self.data = CB
+        setattr(self, "CB", CB)
+        setattr(self, "SCB", SCB)
+        return self
+
+    def xpu(self):
+        # we store the 8-bit rows-major weight
+        B = self.data.contiguous().half().cpu()
+        CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
+        if CBt is not None:
+            del CBt
+        if SCBt is not None:
+            del SCBt
+        self.data = CB
+        setattr(self, "CB", CB)
+        setattr(self, "SCB", SCB)
+        return self
+
     @overload
     def to(
         self: T,
@@ -600,6 +626,18 @@ def to(self, *args, **kwargs):
 
         if device is not None and device.type == "cuda" and self.data.device.type == "cpu":
             return self.cuda(device)
+        elif (
+            device is not None
+            and device.type == "xpu"
+            and self.data.dtype != torch.int8
+        ):
+            return self.xpu()
+        elif (
+            device is not None
+            and device.type == "cpu"
+            and self.data.dtype != torch.int8
+        ):
+            return self.cpu()
         else:
             new_param = Int8Params(
                 super().to(device=device, dtype=dtype, non_blocking=non_blocking),
diff --git a/examples/int8_inference_huggingface_cpu.py b/examples/int8_inference_huggingface_cpu.py
new file mode 100644
index 000000000..b41605893
--- /dev/null
+++ b/examples/int8_inference_huggingface_cpu.py
@@ -0,0 +1,32 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import time
+
+MAX_NEW_TOKENS = 64
+model_id = "facebook/opt-1.3b"
+
+text = 'Hamburg is in which country?\n'
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+input_ids = tokenizer(text, return_tensors="pt").input_ids
+
+print('Loading model {}...'.format(model_id))
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained(
+  model_id,
+  device_map='auto',
+  quantization_config=quantization_config,
+  torch_dtype=torch.bfloat16
+)
+print('model dtype = {}'.format(model.dtype))
+
+with torch.no_grad():
+    t0 = time.time()
+    generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
+    latency = time.time() - t0
+    result = "| latency: " + str(round(latency * 1000, 3)) + " ms |"
+    print('+' + '-' * (len(result) - 2) + '+')
+    print(result)
+    print('+' + '-' * (len(result) - 2) + '+')
+
+output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+print(f"output: {output}")

From 77be40bda0ee724b1d734f107534f04846e64e8a Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Mon, 15 Apr 2024 15:48:20 +0800
Subject: [PATCH 106/234] Remove XPU code; remove cpu example; add UT

---
 bitsandbytes/__init__.py                   |  11 +-
 bitsandbytes/backends/cpu.py               | 186 +------------------
 bitsandbytes/backends/cpu_xpu_common.py    | 203 +++++++++++++++++++++
 bitsandbytes/backends/xpu.py               | 118 ------------
 bitsandbytes/functional.py                 |   2 +-
 examples/int8_inference_huggingface_cpu.py |  32 ----
 tests/test_functional.py                   | 103 +++++++++--
 7 files changed, 302 insertions(+), 353 deletions(-)
 create mode 100644 bitsandbytes/backends/cpu_xpu_common.py
 delete mode 100644 bitsandbytes/backends/xpu.py
 delete mode 100644 examples/int8_inference_huggingface_cpu.py

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 0dae37e8d..48144a870 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -17,20 +17,15 @@
 from .nn import modules
 from .backends import register_backend
 
+from .backends.cpu import CPUBackend
+register_backend("cpu", CPUBackend)
+
 if lib and lib.compiled_with_cuda:
     from .backends.cuda import CUDABackend
     from .optim import adam
 
     register_backend("cuda", CUDABackend())
 
-elif torch.xpu.is_available():
-    from .backends.xpu import XPUBackend
-    register_backend("xpu", XPUBackend)
-
-else:
-    from .backends.cpu import CPUBackend
-    register_backend("cpu", CPUBackend)
-
 __pdoc__ = {
     "libbitsandbytes": False,
     "optim.optimizer.Optimizer8bit": False,
diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 31bb52945..82c411166 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -1,4 +1,9 @@
 import torch
+from .cpu_xpu_common import (
+    double_quant_impl,
+    igemmlt_impl,
+    mm_dequant_impl,
+)
 
 
 Tensor = torch.Tensor
@@ -17,181 +22,6 @@ def assert_on_cpu(tensors):
     return on_cpu
 
 
-@torch.compile(dynamic=True, options={"fx_graph_cache": True})
-def double_quant_common(
-    A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
-):
-    """
-    Find absolute max valus of each row/column of a tensor, and symmetrically quantize it to int8.
-    If threshold > 0.0, only values <= threshold are counted. All outliers are zeroed out in
-    the original tensor and they are kept in COO format: (rows, cols, valus)
-    If threashold == 0.0, there are no outliers.
-    Args:
-        A The tensor to be analyzed and quantized.
-        col_stats Absolute max values of each column of A. If it is not None, use the values directly.
-            Otherwise, find the values.
-        row_stats Absolute max values of each row of A. If it is not None, use the values directly.
-            Otherwise, find the values.
-        out_col Output buffer for the result quantized per column if it is not None
-        out_row Output buffer for the result quantized per row if it is not None
-        threshold The threshold for finding outliers if it is > 0.0. Otherwise it has no effect.
-    Return:
-        A tuple of output quantized per row, output quantized per column, absolute max values of
-        each row of A, absolute max values of each column of A, outliers in COO format
-    """
-    from ..functional import COOSparseTensor
-    cols = A.shape[-1]
-    if len(A.shape) == 3:
-        rows = A.shape[0] * A.shape[1]
-    else:
-        assert A.dim() == 2, f"double_quant: Input tensor should be 2d or 3d but got {A.dim()}d"
-        rows = A.shape[0]
-    A = A.reshape(rows, cols)
-
-    coo_tensor = None
-
-    def get_row_col_stats(A):
-        row_stats = torch.max(torch.abs(A), 1).values # absolute max of each row
-        col_stats = torch.max(torch.abs(A), 0).values # absolute max of each col
-        return row_stats, col_stats
-
-    def quant_to_int8(A, stats):
-        return torch.clamp(torch.round(A / stats * 127).to(torch.int8), -128, 127)
-
-    if threshold == 0.0:
-        if row_stats is None or col_stats is None:
-            row_stats, col_stats = get_row_col_stats(A)
-    else:
-        outlier_indices = torch.abs(A) > threshold # find outliers
-        outlier_coord = outlier_indices.nonzero() # get outlier coordinates
-        outlier_rows = outlier_coord[:, 0] # outlier row for COO sparse tensor
-        outlier_cols = outlier_coord[:, 1] # outlier column for COO sparse tensor
-        outlier_values = A[outlier_indices] # outlier values for COO sparse tensor
-        coo_tensor = COOSparseTensor(
-            A.shape[0], A.shape[1], outlier_values.numel(), outlier_rows.int(), outlier_cols.int(), outlier_values
-        )
-        if row_stats is None or col_stats is None:
-            A[outlier_indices] = 0 # zero out outliers
-            row_stats, col_stats = get_row_col_stats(A)
-            A[outlier_indices] = outlier_values # restore outliers for later use
-
-    quant_by_row = quant_to_int8(A, row_stats.unsqueeze(-1))
-    quant_by_col = quant_to_int8(A, col_stats.unsqueeze(0))
-    if out_row is not None:
-        out_row.copy_(quant_by_row)
-    else:
-        out_row = quant_by_row
-    if out_col is not None:
-        out_col.copy_(quant_by_col)
-    else:
-        out_col = quant_by_col
-    return out_row, out_col, row_stats, col_stats, coo_tensor
-
-
-def igemmlt_common(
-    A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32
-):
-    """
-    Do GEMMM computation. Data type: int8 * int8 -> int32.
-    Args:
-        A Activation of linear, data type is int8
-        B Weight of linear, data type is int8
-        SA Not used for CPU/XPU
-        SB Not used for CPU/XPU
-        out Specified output tensor if it is not None
-        Sout Not used for CPU/XPU but returned as is
-        dtype Data type of output
-    Return:
-        A tuple of GEMM result in dtype and Sout
-    """
-    assert A.dtype == torch.int8
-    assert B.dtype == torch.int8
-    if out is not None:
-        assert out.dtype == dtype
-
-    dimsA = A.ndim
-    dimsB = B.ndim
-    shapeA = A.shape
-    shapeB = B.shape
-    assert dimsA in [2, 3], 'Only two or three dimensional matrices are supported for argument A'
-    assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
-
-    if dimsA == 2:
-        m = shapeA[0]
-    elif dimsA == 3:
-        m = shapeA[0] * shapeA[1]
-    n = shapeB[0]
-    k = shapeA[-1]
-    assert shapeA[-1] == shapeB[-1], f'Shapes of A and B do not match, got {shapeA} and {shapeB}'
-    shapeOut = (shapeA[0], shapeA[1], n) if dimsA == 3 else (m, n)
-
-    # if the tensor is empty, return a transformed empty tensor with the right dimensions
-    if shapeA[0] == 0 and dimsA == 2:
-        return torch.empty((0, n), device=A.device, dtype=A.dtype)
-    elif shapeA[1] == 0 and dimsA == 3:
-        return torch.empty(tuple(shapeA[:2] + [n]), device=A.device, dtype=A.dtype)
-
-    A_reshaped = A.reshape(m, k)
-
-    if assert_on_cpu([A_reshaped, B]):
-        C = torch._int_mm(A_reshaped, B.T).to(dtype)
-    else:
-        C = torch.nn.functional.linear(A_reshaped, B).to(dtype)
-    if C.ndim != dimsA:
-        C = C.reshape(shapeOut)
-    if out is not None:
-        out.copy_(C)
-    else:
-        out = C
-
-    return out, Sout
-
-
-@torch.compile(dynamic=True, options={"fx_graph_cache": True})
-def mm_dequant_common(
-    A,
-    quant_state,
-    row_stats,
-    col_stats,
-    out=None,
-    new_row_stats=None,
-    new_col_stats=None,
-    bias=None,
-    compute_dtype=torch.float32,
-    output_dtype=torch.float32
-):
-    """
-    Dequant and add bias
-    out = A_int32 * (scale_A, scale_B) / 127 * 127 + bias
-    Args:
-        A The output of int8 gemm, whose dtype is int32
-        quant_state Not used for CPU
-        row_stats Absolute max value of each row of input (A) of gemm
-        col_stats Absolute max value of each row of weight (B) of gemm
-        out Output buffer
-        new_row_stats Not used for CPU/XPU
-        new_col_stats Not used for CPU/XPU
-        bias Bias of linear
-        compute_dtype Data type for computation
-        output_dtype Data type for output
-    Return:
-        The result
-    """
-    assert A.dtype == torch.int32
-    out_shape = A.shape
-    if len(out_shape) == 3:
-        out_shape = (out_shape[0] * out_shape[1], out_shape[2])
-
-    A_reshaped = A.reshape(out_shape).to(compute_dtype)
-    row_stats = row_stats.reshape(-1).unsqueeze(-1).to(compute_dtype)
-    col_stats = col_stats.reshape(-1).unsqueeze(0).to(compute_dtype)
-    out = A_reshaped * row_stats * col_stats / (127 * 127)
-    if bias is not None:
-        out = out + bias.to(compute_dtype)
-    out = out.to(output_dtype)
-    return out
-
-
 class CPUBackend:
     mm_dequant_compute_dtype = torch.bfloat16
     mm_dequant_output_dtype = torch.bfloat16
@@ -201,7 +31,7 @@ def double_quant(
         cls, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
     ):
         assert_on_cpu([A, col_stats, row_stats, out_col, out_row])
-        return double_quant_common(A, col_stats, row_stats, out_col, out_row)
+        return double_quant_impl(A, col_stats, row_stats, out_col, out_row)
 
     @classmethod
     def transform(cls, A, to_order=None, from_order='row', out=None, transpose=False, state=None, ld=None):
@@ -226,7 +56,7 @@ def transform(cls, A, to_order=None, from_order='row', out=None, transpose=False
     @classmethod
     def igemmlt(cls, A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32):
         assert_on_cpu([A, B])
-        return igemmlt_common(A, B, SA, SB, out, Sout, dtype)
+        return igemmlt_impl(A, B, SA, SB, out, Sout, dtype)
 
     @classmethod
     def mm_dequant(
@@ -241,7 +71,7 @@ def mm_dequant(
         bias=None
     ):
         assert_on_cpu([A, row_stats, col_stats, out, bias])
-        return mm_dequant_common(
+        return mm_dequant_impl(
             A,
             quant_state,
             row_stats,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
new file mode 100644
index 000000000..e6bc59075
--- /dev/null
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -0,0 +1,203 @@
+import torch
+
+
+Tensor = torch.Tensor
+
+
+def _torch_version_prereq(major, minor):
+    ver_major = int(torch.__version__.split('.')[0])
+    ver_minor = int(torch.__version__.split('.')[1])
+    return ver_major * 32 + ver_minor >= major * 32 + minor
+
+
+def _maybe_torch_compile(func):
+    # torch.compile requires pytorch >= 2.0
+    if _torch_version_prereq(2, 0):
+        options = {}
+        # fx_graph_cache requires pytorch >= 2.2
+        if _torch_version_prereq(2, 2):
+            options.update({"fx_graph_cache": True})
+        return torch.compile(func, dynamic=True, options=options)
+    return func
+
+
+@_maybe_torch_compile
+def double_quant_impl(
+    A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
+):
+    """
+    Find absolute max valus of each row/column of a tensor, and symmetrically quantize it to int8.
+    If threshold > 0.0, only values <= threshold are counted. All outliers are zeroed out in
+    the original tensor and they are kept in COO format: (rows, cols, valus)
+    If threashold == 0.0, there are no outliers.
+    Args:
+        A The tensor to be analyzed and quantized.
+        col_stats Absolute max values of each column of A. If it is not None, use the values directly.
+            Otherwise, find the values.
+        row_stats Absolute max values of each row of A. If it is not None, use the values directly.
+            Otherwise, find the values.
+        out_col Output buffer for the result quantized per column if it is not None
+        out_row Output buffer for the result quantized per row if it is not None
+        threshold The threshold for finding outliers if it is > 0.0. Otherwise it has no effect.
+    Return:
+        A tuple of output quantized per row, output quantized per column, absolute max values of
+        each row of A, absolute max values of each column of A, outliers in COO format
+    """
+    from ..functional import COOSparseTensor
+    cols = A.shape[-1]
+    if len(A.shape) == 3:
+        rows = A.shape[0] * A.shape[1]
+    else:
+        assert A.dim() == 2, f"double_quant: Input tensor should be 2d or 3d but got {A.dim()}d"
+        rows = A.shape[0]
+    A = A.reshape(rows, cols)
+
+    coo_tensor = None
+
+    def get_row_col_stats(A):
+        row_stats = torch.max(torch.abs(A), 1).values # absolute max of each row
+        col_stats = torch.max(torch.abs(A), 0).values # absolute max of each col
+        return row_stats, col_stats
+
+    def quant_to_int8(A, stats):
+        return torch.clamp(torch.round(A * (127.0 / stats)), -128, 127).to(torch.int8)
+
+    if threshold == 0.0:
+        if row_stats is None or col_stats is None:
+            row_stats, col_stats = get_row_col_stats(A)
+    else:
+        outlier_indices = torch.abs(A) > threshold # find outliers
+        outlier_coord = outlier_indices.nonzero() # get outlier coordinates
+        outlier_rows = outlier_coord[:, 0] # outlier row for COO sparse tensor
+        outlier_cols = outlier_coord[:, 1] # outlier column for COO sparse tensor
+        outlier_values = A[outlier_indices] # outlier values for COO sparse tensor
+        coo_tensor = COOSparseTensor(
+            A.shape[0], A.shape[1], outlier_values.numel(), outlier_rows.int(), outlier_cols.int(), outlier_values
+        )
+        if row_stats is None or col_stats is None:
+            A[outlier_indices] = 0 # zero out outliers
+            row_stats, col_stats = get_row_col_stats(A)
+            A[outlier_indices] = outlier_values # restore outliers for later use
+
+    quant_by_row = quant_to_int8(A, row_stats.unsqueeze(-1))
+    quant_by_col = quant_to_int8(A, col_stats.unsqueeze(0))
+    if out_row is not None:
+        out_row.copy_(quant_by_row)
+    else:
+        out_row = quant_by_row
+    if out_col is not None:
+        out_col.copy_(quant_by_col)
+    else:
+        out_col = quant_by_col
+    # Return float stats to align with CUDA impl
+    return out_row, out_col, row_stats.float(), col_stats.float(), coo_tensor
+
+
+def igemmlt_impl(
+    A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32
+):
+    """
+    Do GEMMM computation. Data type: int8 * int8 -> int32.
+    Args:
+        A Activation of linear, data type is int8
+        B Weight of linear, data type is int8
+        SA Not used for CPU/XPU
+        SB Not used for CPU/XPU
+        out Specified output tensor if it is not None
+        Sout Not used for CPU/XPU but returned as is
+        dtype Data type of output
+    Return:
+        A tuple of GEMM result in dtype and Sout
+    """
+    assert A.dtype == torch.int8
+    assert B.dtype == torch.int8
+    if out is not None:
+        assert out.dtype == dtype
+
+    dimsA = A.ndim
+    dimsB = B.ndim
+    shapeA = A.shape
+    shapeB = B.shape
+    assert dimsA in [2, 3], 'Only two or three dimensional matrices are supported for argument A'
+    assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
+
+    if dimsA == 2:
+        m = shapeA[0]
+    elif dimsA == 3:
+        m = shapeA[0] * shapeA[1]
+    if shapeA[-1] == shapeB[0]:
+        B = B.t()
+        shapeB = B.shape
+    else:
+        assert shapeA[-1] == shapeB[-1], f'Shapes of A and B do not match, got {shapeA} and {shapeB}'
+    n = shapeB[0]
+    k = shapeA[-1]
+
+    # if the tensor is empty, return a transformed empty tensor with the right dimensions
+    if shapeA[0] == 0 and dimsA == 2:
+        return torch.empty((0, n), device=A.device, dtype=A.dtype)
+    elif shapeA[1] == 0 and dimsA == 3:
+        return torch.empty(tuple(shapeA[:2] + [n]), device=A.device, dtype=A.dtype)
+
+    A_reshaped = A.reshape(m, k)
+
+    # torch._int_mm is available on CPU since torch 2.4
+    if _torch_version_prereq(2, 4):
+        C = torch._int_mm(A_reshaped, B.T).to(dtype)
+    else:
+        C = torch.matmul(A_reshaped.float(), B.t().float()).to(dtype)
+    if C.ndim != dimsA:
+        assert dimsA == 3
+        shapeOut = (shapeA[0], m // shapeA[0], C.shape[-1])
+        C = C.reshape(shapeOut)
+    if out is not None:
+        out.copy_(C)
+    else:
+        out = C
+
+    return out, Sout
+
+
+@_maybe_torch_compile
+def mm_dequant_impl(
+    A,
+    quant_state,
+    row_stats,
+    col_stats,
+    out=None,
+    new_row_stats=None,
+    new_col_stats=None,
+    bias=None,
+    compute_dtype=torch.float32,
+    output_dtype=torch.float32
+):
+    """
+    Dequant and add bias
+    out = A_int32 * (abs_max_A * abs_max_B) / 127 * 127 + bias
+    Args:
+        A The output of int8 gemm, whose dtype is int32
+        quant_state Not used for CPU
+        row_stats Absolute max value of each row of input (A) of gemm
+        col_stats Absolute max value of each row of weight (B) of gemm
+        out Output buffer
+        new_row_stats Not used for CPU/XPU
+        new_col_stats Not used for CPU/XPU
+        bias Bias of linear
+        compute_dtype Data type for computation
+        output_dtype Data type for output
+    Return:
+        The result
+    """
+    assert A.dtype == torch.int32
+    out_shape = A.shape
+    if len(out_shape) == 3:
+        out_shape = (out_shape[0] * out_shape[1], out_shape[2])
+
+    A_reshaped = A.reshape(out_shape).to(compute_dtype)
+    row_stats = row_stats.reshape(-1).unsqueeze(-1).to(compute_dtype)
+    col_stats = col_stats.reshape(-1).unsqueeze(0).to(compute_dtype)
+    out = A_reshaped * row_stats * col_stats / (127 * 127)
+    if bias is not None:
+        out = out + bias.to(compute_dtype)
+    out = out.to(output_dtype)
+    return out
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
deleted file mode 100644
index 9ee8a09dc..000000000
--- a/bitsandbytes/backends/xpu.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# For Intel GPU (xpu is the device name for Intel GPU in PyTorch)
-import torch
-from .cpu import (
-    double_quant_common,
-    igemmlt_common,
-    mm_dequant_common,
-)
-
-Tensor = torch.Tensor
-
-def assert_on_xpu(tensors):
-    on_xpu = True
-    for t in tensors:
-        if t is None: continue # NULL pointers are fine
-        on_xpu &= (t.device.type == 'xpu')
-    if not on_xpu:
-        raise TypeError(
-            'All input tensors need to be on XPU, but found some tensors to not be on XPU:\n' \
-            f' {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}'
-        )
-    return on_xpu
-
-
-class XPUBackend:
-    mm_dequant_compute_dtype = torch.half
-    mm_dequant_output_dtype = torch.half
-
-    @classmethod
-    @torch.compile(dynamic=True, options={"fx_graph_cache": True})
-    def double_quant(
-        cls, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
-    ):
-        assert_on_xpu([A, col_stats, row_stats, out_col, out_row])
-        return double_quant_common(A, col_stats, row_stats, out_col, out_row)
-
-    @classmethod
-    def transform(cls, A, to_order=None, from_order='row', out=None, transpose=False, state=None, ld=None):
-        """
-        Transform tensor A to to_order. It is originally designed for CUDA.
-        For XPU, it returns the original tensor if transpose=False.
-        Otherwise, it returns the transpose of A
-        """
-        assert_on_xpu([A, out])
-        if transpose:
-            if out is not None:
-                out.copy_(A.T)
-            else:
-                out = A.T
-        else:
-            if out is not None:
-                out.copy_(A)
-            else:
-                out = A
-        return out, state
-
-    @classmethod
-    def igemmlt(cls, A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32):
-        assert_on_xpu([A, B])
-        return igemmlt_common(A, B, SA, SB, out, Sout, dtype)
-
-    @classmethod
-    @torch.compile(dynamic=True, options={"fx_graph_cache": True})
-    def mm_dequant(
-        cls,
-        A,
-        quant_state,
-        row_stats,
-        col_stats,
-        out=None,
-        new_row_stats=None,
-        new_col_stats=None,
-        bias=None
-    ):
-        assert_on_xpu([A, row_stats, col_stats, out, bias])
-        return mm_dequant_common(
-            A,
-            quant_state,
-            row_stats,
-            col_stats,
-            out,
-            new_row_stats,
-            new_col_stats,
-            bias,
-            cls.mm_dequant_compute_dtype,
-            cls.mm_dequant_output_dtype
-        )
-
-    @classmethod
-    def extract_outliers(cls, A, SA, idx):
-        """
-        Extract columns of A by idx
-        """
-        assert_on_xpu([A])
-        return A[:, idx].contiguous()
-
-    @classmethod
-    def quantize_4bit(
-        cls,
-        A: Tensor,
-        absmax: Tensor = None,
-        out: Tensor = None,
-        blocksize=64,
-        compress_statistics=False,
-        quant_type="fp4",
-    ) -> Tensor:
-        assert False, "quantize_4bit not yet implemented for XPU backend"
-
-    @classmethod
-    def dequantize_4bit(
-        cls,
-        A: Tensor,
-        quant_state = None,
-        absmax: Tensor = None,
-        out: Tensor = None,
-        blocksize: int = 64,
-        quant_type="fp4",
-    ) -> Tensor:
-        assert False, "dequantize_4bit not yet implemented for XPU backend"
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index baba76963..54a161f7a 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -2177,7 +2177,7 @@ def vectorwise_quant(x, dim=1, quant_type="vector"):
         return xq, max1
     elif quant_type in ["vector", "row"]:
         max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True)
-        xq = torch.round(x * (C / max1)).to(torch.int8)
+        xq = torch.clamp(torch.round(x * (C / max1)), -128, 127).to(torch.int8)
         return xq, max1
     elif quant_type == "zeropoint":
         dtype = x.dtype
diff --git a/examples/int8_inference_huggingface_cpu.py b/examples/int8_inference_huggingface_cpu.py
deleted file mode 100644
index b41605893..000000000
--- a/examples/int8_inference_huggingface_cpu.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-import time
-
-MAX_NEW_TOKENS = 64
-model_id = "facebook/opt-1.3b"
-
-text = 'Hamburg is in which country?\n'
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-input_ids = tokenizer(text, return_tensors="pt").input_ids
-
-print('Loading model {}...'.format(model_id))
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-model = AutoModelForCausalLM.from_pretrained(
-  model_id,
-  device_map='auto',
-  quantization_config=quantization_config,
-  torch_dtype=torch.bfloat16
-)
-print('model dtype = {}'.format(model.dtype))
-
-with torch.no_grad():
-    t0 = time.time()
-    generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
-    latency = time.time() - t0
-    result = "| latency: " + str(round(latency * 1000, 3)) + " ms |"
-    print('+' + '-' * (len(result) - 2) + '+')
-    print(result)
-    print('+' + '-' * (len(result) - 2) + '+')
-
-output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-print(f"output: {output}")
diff --git a/tests/test_functional.py b/tests/test_functional.py
index b9f1a6ead..cf4088c00 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -576,28 +576,37 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
 @pytest.mark.parametrize("dim4", get_test_dims(32, 1024, n=1), ids=id_formatter("dim4"))
 @pytest.mark.parametrize("dims", (2, 3), ids=id_formatter("dims"))
 @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb"))
-def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb):
+@pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device"))
+def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device):
     for i in range(k):
         if dims == 2:
-            A = torch.randint(-128, 127, size=(dim1, dim3), device="cuda").to(torch.int8)
+            A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8)
         elif dims == 3:
-            A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device="cuda").to(torch.int8)
-        B = torch.randint(-128, 127, size=(dim4, dim3), device="cuda").to(torch.int8)
+            A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device=device).to(torch.int8)
+        B = torch.randint(-128, 127, size=(dim4, dim3), device=device).to(torch.int8)
         C1 = torch.matmul(A.float(), B.t().float())
 
         A2, SA = F.transform(A, "col32")
         B2, SB = F.transform(B, "col_turing")
         C2, SC = F.igemmlt(A2, B2, SA, SB)
-        C3, S = F.nvidia_transform(C2, "row", state=SC)
+        if device == "cpu":
+            assert SC is None
+        if device == "cuda":
+            C3, S = F.nvidia_transform(C2, "row", state=SC)
+        else:
+            C3, S = C2, None
         torch.testing.assert_close(C1, C3.float())
 
         # transpose
-        B = torch.randint(-128, 127, size=(dim3, dim4), device="cuda").to(torch.int8)
+        B = torch.randint(-128, 127, size=(dim3, dim4), device=device).to(torch.int8)
         C1 = torch.matmul(A.float(), B.float())
 
         B2t, SBt = F.transform(B, "col_turing", transpose=True)
         C2, SC = F.igemmlt(A2, B2t, SA, SBt)
-        C3, S = F.nvidia_transform(C2, "row", state=SC)
+        if device == "cuda":
+            C3, S = F.nvidia_transform(C2, "row", state=SC)
+        else:
+            C3, S = C2, None
         torch.testing.assert_close(C1, C3.float())
 
 
@@ -846,6 +855,33 @@ def test_dequant_mm(dim1, dim4, dims, formatB, has_bias):
         assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01 * n))
 
 
+@pytest.mark.parametrize("dim1", get_test_dims(64, 256, n=2), ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim4", get_test_dims(64, 1024, n=2), ids=id_formatter("dim4"))
+@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims"))
+@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias"))
+def test_dequant_mm_cpu(dim1, dim4, dims, has_bias):
+    inner = torch.randint(1, 128, size=(1,)).item()
+    bias = None
+    if has_bias:
+        bias = torch.randn(dim4, device="cpu", dtype=torch.bfloat16)
+    for i in range(1):
+        A = torch.randn(dim1, inner, device="cpu")
+        B = torch.randn(dim4, inner, device="cpu")
+
+        A1, maxA = F.vectorwise_quant(A, dim=1)
+        B1, maxB = F.vectorwise_quant(B, dim=1)
+
+        C2, SC = F.igemmlt(A1, B1, SA=None, SB=None)
+        assert SC is None
+
+        C3 = F.vectorwise_mm_dequant(C2.bfloat16(), maxA, maxB.t())
+        if has_bias:
+            C3 += bias
+
+        C4 = F.mm_dequant(C2, SC, maxA.flatten(), maxB.flatten(), bias=bias)
+        torch.testing.assert_close(C3.float(), C4.float(), atol=0.05, rtol=0.1)
+
+
 @pytest.mark.parametrize("dim1", [1 * 1024], ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", [1 * 1024], ids=id_formatter("dim2"))
 @pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims"))
@@ -892,9 +928,13 @@ def test_colrow_absmax(dim1, dim2, dims):
 
 @pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2"))
-def test_double_quant(dim1, dim2):
+@pytest.mark.parametrize("device", ["cuda","cpu"], ids=id_formatter("device"))
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype"))
+def test_double_quant(dim1, dim2, device, dtype):
+    if device == "cuda" and dtype == torch.bfloat16:
+        pytest.skip("BFloat16 not supported on CUDA")
     for i in range(k):
-        A = torch.randn(dim1, dim2, device="cuda").half()
+        A = torch.randn(dim1, dim2, device=device).to(dtype)
         out_col1, Scol = F.vectorwise_quant(A, dim=0)
         out_row1, Srow = F.vectorwise_quant(A, dim=1)
 
@@ -1125,6 +1165,33 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
         torch.testing.assert_close(out1, out2)
 
 
+@pytest.mark.parametrize("dim1", get_test_dims(2, 1024, n=2), ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", get_test_dims(2, 1024, n=2), ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim3", [0], ids=id_formatter("dim3"))
+@pytest.mark.parametrize("dims", [2], ids=id_formatter("dims"))
+@pytest.mark.parametrize("dtype", [torch.int8], ids=describe_dtype)
+@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA"))
+@pytest.mark.parametrize("orderOut", ["col32", "col_turing", "col_ampere"], ids=id_formatter("orderOut"))
+@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose"))
+def test_transform_cpu(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
+    for i in range(k):
+        if dims == 2:
+            A = torch.randint(10, 99, size=(dim1, dim2), device="cpu").to(dtype)
+        elif dims == 3:
+            A = torch.randint(10, 99, size=(dim1, dim2, dim3), device="cpu").to(dtype)
+
+        A.view(-1)[-1] = -1
+        if transpose:
+            out1 = A.t().contiguous()
+        else:
+            out1 = A
+        out2, S2 = F.transform(A, to_order=orderOut, transpose=transpose)
+
+        assert S2 is None
+
+        torch.testing.assert_close(out1, out2)
+
+
 def test_overflow():
     formatB = F.get_special_format_str()
     print(formatB)
@@ -1141,10 +1208,14 @@ def test_overflow():
 
 @pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2"))
-def test_coo_double_quant(dim1, dim2):
+@pytest.mark.parametrize("device", ["cuda","cpu"], ids=id_formatter("device"))
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype"))
+def test_coo_double_quant(dim1, dim2, device, dtype):
+    if device == "cuda" and dtype == torch.bfloat16:
+        pytest.skip("BFloat16 not supported on CUDA")
     threshold = 3.00
     for i in range(k):
-        A = torch.randn(dim1, dim2, device="cuda").half()
+        A = torch.randn(dim1, dim2, device=device).to(dtype)
 
         idx = torch.abs(A) >= threshold
         CA2, CAt, statsA, statsAt, coo_tensor = F.double_quant(A)
@@ -1157,7 +1228,7 @@ def test_coo_double_quant(dim1, dim2):
             torch.testing.assert_close(A1, A2)
 
             A1 = A * (idx == 0)
-            A2 = (CA.float() * statsA.unsqueeze(1) / 127).half()
+            A2 = (CA.float() * statsA.unsqueeze(1) / 127).to(dtype)
             torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2)
 
 
@@ -1729,12 +1800,12 @@ def quant_zp(x):
     print(err1, err2, err3, err4, err5, err6)
 
 
-def test_extract_outliers():
+@pytest.mark.parametrize("device", ["cuda", "cpu"])
+def test_extract_outliers(device):
     for i in range(k):
         shapeA = (4096, 4096 * 4)
-        idx = torch.unique(torch.randint(0, shapeA[1], size=(10,)).int()).cuda()
-        # idx = torch.Tensor([0]).int().cuda()
-        A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8)
+        idx = torch.unique(torch.randint(0, shapeA[1], size=(10,)).int()).to(device=device)
+        A = torch.randint(-128, 127, size=shapeA, device=device).to(torch.int8)
         outliers1 = A[:, idx.long()]
 
         CA, SA = F.transform(A, "col_turing")

From 8d0b695d8aadaa225b39352068a8dc7d999c4eae Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Mon, 15 Apr 2024 02:40:40 -0700
Subject: [PATCH 107/234] Fix igemmlt correctness issue

---
 bitsandbytes/backends/cpu_xpu_common.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index e6bc59075..5be83d8e3 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -125,13 +125,9 @@ def igemmlt_impl(
         m = shapeA[0]
     elif dimsA == 3:
         m = shapeA[0] * shapeA[1]
-    if shapeA[-1] == shapeB[0]:
-        B = B.t()
-        shapeB = B.shape
-    else:
-        assert shapeA[-1] == shapeB[-1], f'Shapes of A and B do not match, got {shapeA} and {shapeB}'
     n = shapeB[0]
     k = shapeA[-1]
+    assert shapeA[-1] == shapeB[-1], f'Shapes of A and B do not match, got {shapeA} and {shapeB}'
 
     # if the tensor is empty, return a transformed empty tensor with the right dimensions
     if shapeA[0] == 0 and dimsA == 2:

From 67d86611d5b4e34f5d8e8ebc1c1e08dddee671ae Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Wed, 17 Apr 2024 23:06:57 -0700
Subject: [PATCH 108/234] Bug fix for double_quant

---
 bitsandbytes/backends/cpu.py            |  2 +-
 bitsandbytes/backends/cpu_xpu_common.py | 11 +++++++++--
 bitsandbytes/nn/modules.py              | 19 -------------------
 tests/test_functional.py                |  4 +++-
 4 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 82c411166..5183e6485 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -31,7 +31,7 @@ def double_quant(
         cls, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
     ):
         assert_on_cpu([A, col_stats, row_stats, out_col, out_row])
-        return double_quant_impl(A, col_stats, row_stats, out_col, out_row)
+        return double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
 
     @classmethod
     def transform(cls, A, to_order=None, from_order='row', out=None, transpose=False, state=None, ld=None):
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 5be83d8e3..7c7927c88 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -1,4 +1,5 @@
 import torch
+import warnings
 
 
 Tensor = torch.Tensor
@@ -66,7 +67,7 @@ def quant_to_int8(A, stats):
         if row_stats is None or col_stats is None:
             row_stats, col_stats = get_row_col_stats(A)
     else:
-        outlier_indices = torch.abs(A) > threshold # find outliers
+        outlier_indices = torch.abs(A) >= threshold # find outliers
         outlier_coord = outlier_indices.nonzero() # get outlier coordinates
         outlier_rows = outlier_coord[:, 0] # outlier row for COO sparse tensor
         outlier_cols = outlier_coord[:, 1] # outlier column for COO sparse tensor
@@ -77,10 +78,13 @@ def quant_to_int8(A, stats):
         if row_stats is None or col_stats is None:
             A[outlier_indices] = 0 # zero out outliers
             row_stats, col_stats = get_row_col_stats(A)
-            A[outlier_indices] = outlier_values # restore outliers for later use
 
     quant_by_row = quant_to_int8(A, row_stats.unsqueeze(-1))
     quant_by_col = quant_to_int8(A, col_stats.unsqueeze(0))
+
+    if coo_tensor is not None:
+        A[outlier_indices] = outlier_values # restore outliers for later use
+
     if out_row is not None:
         out_row.copy_(quant_by_row)
     else:
@@ -189,6 +193,9 @@ def mm_dequant_impl(
     if len(out_shape) == 3:
         out_shape = (out_shape[0] * out_shape[1], out_shape[2])
 
+    if compute_dtype not in [torch.float32, torch.bfloat16]:
+        warnings.warn(f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use float instead")
+        compute_dtype = torch.float32
     A_reshaped = A.reshape(out_shape).to(compute_dtype)
     row_stats = row_stats.reshape(-1).unsqueeze(-1).to(compute_dtype)
     col_stats = col_stats.reshape(-1).unsqueeze(0).to(compute_dtype)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index bcba8b3d2..c0efafec0 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -594,19 +594,6 @@ def cpu(self):
         setattr(self, "SCB", SCB)
         return self
 
-    def xpu(self):
-        # we store the 8-bit rows-major weight
-        B = self.data.contiguous().half().cpu()
-        CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
-        if CBt is not None:
-            del CBt
-        if SCBt is not None:
-            del SCBt
-        self.data = CB
-        setattr(self, "CB", CB)
-        setattr(self, "SCB", SCB)
-        return self
-
     @overload
     def to(
         self: T,
@@ -626,12 +613,6 @@ def to(self, *args, **kwargs):
 
         if device is not None and device.type == "cuda" and self.data.device.type == "cpu":
             return self.cuda(device)
-        elif (
-            device is not None
-            and device.type == "xpu"
-            and self.data.dtype != torch.int8
-        ):
-            return self.xpu()
         elif (
             device is not None
             and device.type == "cpu"
diff --git a/tests/test_functional.py b/tests/test_functional.py
index cf4088c00..ba1e32d77 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1221,6 +1221,8 @@ def test_coo_double_quant(dim1, dim2, device, dtype):
         CA2, CAt, statsA, statsAt, coo_tensor = F.double_quant(A)
         CA, CAt, statsA, statsAt, coo_tensor = F.double_quant(A, threshold=threshold)
 
+        if idx.sum() > 0:
+            assert coo_tensor is not None
         if coo_tensor is not None:
             A1 = A * idx
             A2 = torch.zeros_like(A)
@@ -1229,7 +1231,7 @@ def test_coo_double_quant(dim1, dim2, device, dtype):
 
             A1 = A * (idx == 0)
             A2 = (CA.float() * statsA.unsqueeze(1) / 127).to(dtype)
-            torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2)
+            torch.testing.assert_close(A1, A2, rtol=0.05, atol=1.5e-2)
 
 
 @pytest.mark.parametrize("dim1", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim1"))

From 92900f6cc82d0010909aa6eadc13bdc497fb36f9 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Thu, 18 Apr 2024 04:24:23 -0700
Subject: [PATCH 109/234] Remove torch.compile for double_quant

---
 bitsandbytes/__init__.py                | 1 -
 bitsandbytes/backends/cpu_xpu_common.py | 2 +-
 bitsandbytes/nn/modules.py              | 5 ++---
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 48144a870..cc7812e4e 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -25,7 +25,6 @@
     from .optim import adam
 
     register_backend("cuda", CUDABackend())
-
 __pdoc__ = {
     "libbitsandbytes": False,
     "optim.optimizer.Optimizer8bit": False,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 7c7927c88..c6573b6c0 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -22,7 +22,7 @@ def _maybe_torch_compile(func):
     return func
 
 
-@_maybe_torch_compile
+# Don't use torch.compile for now due to PyTorch issue https://github.com/pytorch/pytorch/issues/124382
 def double_quant_impl(
     A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
 ):
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index c0efafec0..9295e4c70 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -611,11 +611,10 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if device is not None and device.type == "cuda" and self.data.device.type == "cpu":
+        if device.type == "cuda" and self.data.device.type == "cpu":
             return self.cuda(device)
         elif (
-            device is not None
-            and device.type == "cpu"
+            device.type == "cpu"
             and self.data.dtype != torch.int8
         ):
             return self.cpu()

From 79cb5548c7cbe8a129dea42e7e38feb1c1251979 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 18 Apr 2024 23:11:04 +0000
Subject: [PATCH 110/234] Update gpu arch setting

---
 CMakeLists.txt | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4b776005f..6b9b2dbe6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -174,7 +174,16 @@ if(BUILD_CUDA)
 elseif(BUILD_HIP)
     enable_language(HIP)
     message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
-    message(STATUS "HIP Targets: ${AMDGPU_TARGETS}")
+    if(DEFINED BNB_ROCM_ARCH)
+      set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
+    else()
+      if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+        set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx942")
+      elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+        set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+      endif()
+    endif()
+    message(STATUS "HIP Targets: ${CMAKE_HIP_ARCHITECTURES}")
 
     list(APPEND SRC_FILES ${HIP_FILES})
 

From 5c0414e20545c3ae9162ab8428d10e290e2047f6 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 18 Apr 2024 23:13:00 +0000
Subject: [PATCH 111/234] Add ROCM_PATH variable

---
 CMakeLists.txt | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b9b2dbe6..113c3d037 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -250,7 +250,12 @@ if(BUILD_CUDA)
     )
 endif()
 if(BUILD_HIP)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+    if(NOT DEFINED ENV{ROCM_PATH})
+      set(ROCM_PATH /opt/rocm)
+    else()
+      set(ROCM_PATH $ENV{ROCM_PATH})
+    endif()
+    list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
     macro(find_package_and_print_version PACKAGE_NAME)
       find_package("${PACKAGE_NAME}" ${ARGN})
       message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
@@ -264,8 +269,8 @@ if(BUILD_HIP)
     set_target_properties(hip-lang::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
     set(CMAKE_HIP_IMPLICIT_LINK_LIBRARIES "")
 
-    target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include /opt/rocm/include /include)
-    target_link_directories(bitsandbytes PRIVATE /opt/rocm/lib /lib)
+    target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ${ROCM_PATH}/include /include)
+    target_link_directories(bitsandbytes PRIVATE ${ROCM_PATH}/lib /lib)
     target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand roc::hipsparse)
 
     target_compile_definitions(bitsandbytes PUBLIC BNB_USE_HIP)

From 47795f5586661bfe79558c975e163fc0a38e8b47 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 18 Apr 2024 23:13:47 +0000
Subject: [PATCH 112/234] Add HIP_VERSION variable

---
 CMakeLists.txt | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 113c3d037..373db6550 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -188,7 +188,12 @@ elseif(BUILD_HIP)
     list(APPEND SRC_FILES ${HIP_FILES})
 
     string(APPEND BNB_OUTPUT_NAME "_hip")
-    if(NO_CUBLASLT)
+
+    # get hip version
+    execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION)
+    string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
+
+    if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1")
         string(APPEND BNB_OUTPUT_NAME "_nohipblaslt")
     endif()
     add_compile_definitions(__HIP_PLATFORM_AMD__)
@@ -277,7 +282,7 @@ if(BUILD_HIP)
     set_source_files_properties(${HIP_FILES} PROPERTIES LANGUAGE HIP)
     set_target_properties(bitsandbytes PROPERTIES LINKER_LANGUAGE CXX)
 
-    if(NO_CUBLASLT)
+    if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1")
 	target_compile_definitions(bitsandbytes PUBLIC NO_HIPBLASLT)
     else()
 	find_package(hipblaslt)

From 6d9045241e61d2a8f29a5ad48325c1f25a347be9 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 18 Apr 2024 23:14:42 +0000
Subject: [PATCH 113/234] Add BNB_HIP_VERSION variable

---
 bitsandbytes/cextension.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 157f3a65a..69cf0b15f 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -37,7 +37,10 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path:
     The library is not guaranteed to exist at the returned path.
     """
     if torch.version.hip:
-        return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}"
+        if BNB_HIP_VERSION < 601:
+            return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
+        else:
+            return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}"
     library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}"
     if not cuda_specs.has_cublaslt:
         # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
@@ -111,8 +114,12 @@ def get_native_library() -> BNBNativeLibrary:
 
 
 try:
+    if torch.version.hip:
+        hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
+        HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
+    else:
+        HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
     lib = get_native_library()
-    HIP_ENVIRONMENT = True if torch.version.hip else False
 except Exception as e:
     lib = None
     logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True)

From 049a2dc5147a2d7c179a5dc202546f518f5475a1 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 18 Apr 2024 23:15:37 +0000
Subject: [PATCH 114/234] Update supports igemmlt based on HIP version

---
 bitsandbytes/autograd/_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 15574d702..3eafd502a 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -8,7 +8,7 @@
 import torch
 
 import bitsandbytes.functional as F
-
+from bitsandbytes.cextension import BNB_HIP_VERSION
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):
@@ -218,7 +218,7 @@ def backward(ctx, grad_output):
 def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""
     if torch.version.hip:
-        return True
+        return False if BNB_HIP_VERSION < 601 else True
     if torch.cuda.get_device_capability(device=device) < (7, 5):
         return False
     device_name = torch.cuda.get_device_name(device=device)

From 47a0bc3b63d0f9dcbdb97696af317f335ca2b2d4 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 18 Apr 2024 23:16:17 +0000
Subject: [PATCH 115/234] Skip failing tests based on HIP version

---
 tests/test_autograd.py   | 2 ++
 tests/test_functional.py | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 9da665a2d..eafa01f0e 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -4,6 +4,7 @@
 import torch
 
 import bitsandbytes as bnb
+from bitsandbytes.cextension import BNB_HIP_VERSION
 from tests.helpers import (
     BOOLEAN_TRIPLES,
     BOOLEAN_TUPLES,
@@ -198,6 +199,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool
                 assert (idx == 0).sum().item() < n * 0.02
 
 
+@pytest.mark.skipif(0 < BNB_HIP_VERSION < 601, reason="this test is supported on ROCm from 6.1")
 @pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2"))
 @pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3"))
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 0f817d1dc..13a43cb70 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -11,7 +11,7 @@
 
 import bitsandbytes as bnb
 from bitsandbytes import functional as F
-from bitsandbytes.cextension import HIP_ENVIRONMENT
+from bitsandbytes.cextension import HIP_ENVIRONMENT, BNB_HIP_VERSION
 from tests.helpers import BOOLEAN_TUPLES, TRUE_FALSE, describe_dtype, get_blocksizes, get_test_dims, id_formatter
 
 torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
@@ -505,6 +505,7 @@ def test_vector_quant(dim1, dim2, dim3):
         assert_all_approx_close(A1, A, atol=0.01, rtol=0.1, count=int(n * 0.002))
 
 
+@pytest.mark.skipif(0 < BNB_HIP_VERSION < 601, reason="this test is supported on ROCm from 6.1")
 @pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2"))
 @pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3"))
@@ -1733,6 +1734,7 @@ def quant_zp(x):
     print(err1, err2, err3, err4, err5, err6)
 
 
+@pytest.mark.skipif(0 < BNB_HIP_VERSION < 601, reason="this test is supported on ROCm from 6.1")
 def test_extract_outliers():
     for i in range(k):
         shapeA = (4096, 4096 * 4)

From 1b2a0951e227a349188c4dc74ebcf7029362bc35 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 18 Apr 2024 23:18:46 +0000
Subject: [PATCH 116/234] pre-commit fixes

---
 bitsandbytes/autograd/_functions.py | 3 ++-
 tests/test_functional.py            | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 3eafd502a..18ca66b17 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -7,8 +7,9 @@
 
 import torch
 
-import bitsandbytes.functional as F
 from bitsandbytes.cextension import BNB_HIP_VERSION
+import bitsandbytes.functional as F
+
 
 # math.prod not compatible with python < 3.8
 def prod(iterable):
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 13a43cb70..04a898d4b 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -11,7 +11,7 @@
 
 import bitsandbytes as bnb
 from bitsandbytes import functional as F
-from bitsandbytes.cextension import HIP_ENVIRONMENT, BNB_HIP_VERSION
+from bitsandbytes.cextension import BNB_HIP_VERSION, HIP_ENVIRONMENT
 from tests.helpers import BOOLEAN_TUPLES, TRUE_FALSE, describe_dtype, get_blocksizes, get_test_dims, id_formatter
 
 torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)

From 4515a2186a997edd8e4d91cc9b371e89322906bd Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 18 Apr 2024 23:54:57 +0000
Subject: [PATCH 117/234] Update README file

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 415679df9..9503ff1ff 100644
--- a/README.md
+++ b/README.md
@@ -8,13 +8,13 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu
 
 **Installation for ROCm:**
 
-To install latest bitsandbytes (supported on ROCm 6.2):
+To install develop version:
 ```bash
 git clone --recurse https://github.com/ROCm/bitsandbytes
 cd bitsandbytes
 git checkout rocm_enabled
 pip install -r requirements-dev.txt
-cmake -DCOMPUTE_BACKEND=hip -S .
+cmake -DCOMPUTE_BACKEND=hip -S . (Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch)
 make
 pip install .
 ```

From 717245d4f377484de5bf67c22c58ac13fc2d02cc Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Thu, 18 Apr 2024 19:56:32 -0700
Subject: [PATCH 118/234] refine pytest.skip message

---
 tests/test_functional.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index ba1e32d77..566d8429f 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -932,7 +932,7 @@ def test_colrow_absmax(dim1, dim2, dims):
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype"))
 def test_double_quant(dim1, dim2, device, dtype):
     if device == "cuda" and dtype == torch.bfloat16:
-        pytest.skip("BFloat16 not supported on CUDA")
+        pytest.skip("bfloat16 is not implemented for this operation on CUDA backend")
     for i in range(k):
         A = torch.randn(dim1, dim2, device=device).to(dtype)
         out_col1, Scol = F.vectorwise_quant(A, dim=0)
@@ -1212,7 +1212,7 @@ def test_overflow():
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype"))
 def test_coo_double_quant(dim1, dim2, device, dtype):
     if device == "cuda" and dtype == torch.bfloat16:
-        pytest.skip("BFloat16 not supported on CUDA")
+        pytest.skip("bfloat16 is not implemented for this operation on CUDA backend")
     threshold = 3.00
     for i in range(k):
         A = torch.randn(dim1, dim2, device=device).to(dtype)

From e7ef75fc8481ecb83f312a7c7a842b5d3c434000 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 19 Apr 2024 14:27:20 +0000
Subject: [PATCH 119/234] Update default arch list

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 373db6550..3bedefd51 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -178,7 +178,7 @@ elseif(BUILD_HIP)
       set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
     else()
       if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-        set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx942")
+        set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942")
       elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
         set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
       endif()

From c0d244c99bb169b19c313ede468234ef513776d7 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 19 Apr 2024 16:08:50 +0000
Subject: [PATCH 120/234] update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9503ff1ff..377ca2e86 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ git clone --recurse https://github.com/ROCm/bitsandbytes
 cd bitsandbytes
 # Checkout branch as needed
 # for rocm 5.7 - rocm5.7_internal_testing
-# for rocm 6.2 - rocm6.2_internal_testing
+# for rocm 6.x - rocm6.2_internal_testing
 git checkout <branch>
 make hip
 python setup.py install

From 79652a587d287436cf62aa4e28207a6233ab434a Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Mon, 22 Apr 2024 18:57:55 +0000
Subject: [PATCH 121/234] update igemmlt for hip

---
 bitsandbytes/backends/cuda.py | 50 +++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index c76bcaebd..5757cb20f 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -3,7 +3,7 @@
 
 import torch
 
-from bitsandbytes.cextension import lib
+from bitsandbytes.cextension import lib, HIP_ENVIRONMENT
 from bitsandbytes.functional import (
     CUBLAS_Context,
     coo_zeros,
@@ -188,9 +188,15 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
             return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
 
         if dimsA == 2 and out is None:
-            out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row")
+            if HIP_ENVIRONMENT:
+                out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col", "row")
+            else:
+                out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row")
         elif dimsA == 3 and out is None:
-            out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row")
+            if HIP_ENVIRONMENT:
+                out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col", "row")
+            else:
+                out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row")
 
         assert dimsB != 3, "len(B.shape)==3 not supported"
         assert A.device.type == "cuda"
@@ -198,9 +204,14 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         assert A.dtype == torch.int8
         assert B.dtype == torch.int8
         assert out.dtype == dtype
-        assert SA[1] == "col32"
-        assert SB[1] in ["col_turing", "col_ampere"]
-        assert Sout[1] == "col32"
+        if HIP_ENVIRONMENT:
+            assert SA[1] == "col"
+            assert SB[1] == "col"
+            assert Sout[1] == "col"
+        else:
+            assert SA[1] == "col32"
+            assert SB[1] in ["col_turing", "col_ampere"]
+            assert Sout[1] == "col32"
         assert (
             shapeA[-1] == shapeB[-1]
         ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
@@ -215,17 +226,22 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         ptrC = get_ptr(out)
 
         k = shapeA[-1]
-        lda = ct.c_int32(m * 32)
-        if formatB == "col_turing":
-            # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
-            # n = rows
-            ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+        if HIP_ENVIRONMENT:
+            lda = ct.c_int32(m)
+            ldb = ct.c_int32(shapeB[0])
+            ldc = ct.c_int32(m)
         else:
-            # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
-            # n = rows
-            ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
+            lda = ct.c_int32(m * 32)
+            if formatB == "col_turing":
+                # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
+                # n = rows
+                ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+            else:
+                # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
+                # n = rows
+                ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
 
-        ldc = ct.c_int32(m * 32)
+            ldc = ct.c_int32(m * 32)
         m = ct.c_int32(m)
         n = ct.c_int32(n)
         k = ct.c_int32(k)
@@ -234,7 +250,7 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         ptrRowScale = get_ptr(None)
         is_on_gpu([A, B, out])
 
-        if formatB == "col_turing":
+        if formatB == "col_turing"  or HIP_ENVIRONMENT:
             if dtype == torch.int32:
                 has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
             else:
@@ -246,7 +262,7 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
             else:
                 has_error = lib.cigemmlt_ampere_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
 
-        if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
+        if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`, `ops.hip`
             raise NotImplementedError("igemmlt not available (probably built with NO_CUBLASLT)")
 
         if has_error:

From aedfa8fa20865e1ac65caeb38009f4a90068c095 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Mon, 22 Apr 2024 19:09:04 +0000
Subject: [PATCH 122/234] Update mm_dequant for hip

---
 bitsandbytes/backends/cuda.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 5757cb20f..73f4aa0aa 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -13,6 +13,7 @@
     get_colrow_absmax,
     get_ptr,
     get_transform_buffer,
+    nvidia_transform,
     is_on_gpu,
     post_call,
     pre_call,
@@ -278,6 +279,8 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     def mm_dequant(
         self, A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None
     ):
+        if HIP_ENVIRONMENT:
+            A, quant_state = nvidia_transform(A, "row", state=quant_state)
         assert A.dtype == torch.int32
         if bias is not None:
             assert bias.dtype == torch.float16

From 7835282a9313878f4a7655d26f5689d128f7bd39 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Mon, 22 Apr 2024 19:11:33 +0000
Subject: [PATCH 123/234] Update transform function for hip

---
 bitsandbytes/backends/cuda.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 73f4aa0aa..026a4c9a9 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -116,6 +116,9 @@ def double_quant(self, A, col_stats=None, row_stats=None, out_col=None, out_row=
         return out_row, out_col, row_stats, col_stats, coo_tensor
 
     def transform(self, A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None):
+        if HIP_ENVIRONMENT:
+            return nvidia_transform(A, to_order, from_order, out, transpose, state, ld)
+
         prev_device = pre_call(A.device)
         if state is None:
             state = (A.shape, from_order)

From 93e04b5cfa56e206a9699a60a4c35972f23e69b6 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Wed, 24 Apr 2024 18:37:02 -0700
Subject: [PATCH 124/234] Fix lint issues

---
 bitsandbytes/__init__.py                |  5 +--
 bitsandbytes/autograd/_functions.py     |  4 +--
 bitsandbytes/backends/cpu.py            | 31 +++++++-----------
 bitsandbytes/backends/cpu_xpu_common.py | 43 ++++++++++++-------------
 bitsandbytes/functional.py              |  2 +-
 bitsandbytes/nn/modules.py              |  9 ++----
 tests/test_functional.py                |  4 +--
 7 files changed, 42 insertions(+), 56 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index cc7812e4e..dc9094a2c 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
+
 from . import research, utils
 from .autograd._functions import (
     MatmulLtState,
@@ -13,11 +14,11 @@
     matmul_cublas,
     mm_cublas,
 )
+from .backends import register_backend
+from .backends.cpu import CPUBackend
 from .cextension import lib
 from .nn import modules
-from .backends import register_backend
 
-from .backends.cpu import CPUBackend
 register_backend("cpu", CPUBackend)
 
 if lib and lib.compiled_with_cuda:
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 67b8b6b87..08d2d9fa6 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -217,7 +217,7 @@ def backward(ctx, grad_output):
 
 def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""
-    if device == torch.device('cpu'):
+    if device == torch.device("cpu"):
         return True
     if torch.cuda.get_device_capability(device=device) < (7, 5):
         return False
@@ -315,7 +315,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
 
         # Cast A to fp16
         A_dtype = torch.float16
-        if A.device == torch.device('cpu'):
+        if A.device == torch.device("cpu"):
             A_dtype = torch.bfloat16
         if A.dtype != A_dtype:
             warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to {A_dtype} during quantization")
diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 5183e6485..fe77005a0 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -1,23 +1,24 @@
 import torch
+
 from .cpu_xpu_common import (
     double_quant_impl,
     igemmlt_impl,
     mm_dequant_impl,
 )
 
-
 Tensor = torch.Tensor
 
 
 def assert_on_cpu(tensors):
     on_cpu = True
     for t in tensors:
-        if t is None: continue # NULL pointers are fine
-        on_cpu &= (t.device.type == 'cpu')
+        if t is None:
+            continue  # NULL pointers are fine
+        on_cpu &= t.device.type == "cpu"
     if not on_cpu:
         raise TypeError(
-            'All input tensors need to be on CPU, but found some tensors to not be on CPU:\n' \
-            f' {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}'
+            "All input tensors need to be on CPU, but found some tensors to not be on CPU:\n"
+            f" {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}"
         )
     return on_cpu
 
@@ -27,14 +28,12 @@ class CPUBackend:
     mm_dequant_output_dtype = torch.bfloat16
 
     @classmethod
-    def double_quant(
-        cls, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
-    ):
+    def double_quant(cls, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
         assert_on_cpu([A, col_stats, row_stats, out_col, out_row])
         return double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
 
     @classmethod
-    def transform(cls, A, to_order=None, from_order='row', out=None, transpose=False, state=None, ld=None):
+    def transform(cls, A, to_order=None, from_order="row", out=None, transpose=False, state=None, ld=None):
         """
         Transform tensor A to to_order. It is originally designed for CUDA.
         For CPU, it returns the original tensor if transpose=False.
@@ -60,15 +59,7 @@ def igemmlt(cls, A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32)
 
     @classmethod
     def mm_dequant(
-        cls,
-        A,
-        quant_state,
-        row_stats,
-        col_stats,
-        out=None,
-        new_row_stats=None,
-        new_col_stats=None,
-        bias=None
+        cls, A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None
     ):
         assert_on_cpu([A, row_stats, col_stats, out, bias])
         return mm_dequant_impl(
@@ -81,7 +72,7 @@ def mm_dequant(
             new_col_stats,
             bias,
             cls.mm_dequant_compute_dtype,
-            cls.mm_dequant_output_dtype
+            cls.mm_dequant_output_dtype,
         )
 
     @classmethod
@@ -108,7 +99,7 @@ def quantize_4bit(
     def dequantize_4bit(
         cls,
         A: Tensor,
-        quant_state = None,
+        quant_state=None,
         absmax: Tensor = None,
         out: Tensor = None,
         blocksize: int = 64,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index c6573b6c0..c4bd25a04 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -1,13 +1,13 @@
-import torch
 import warnings
 
+import torch
 
 Tensor = torch.Tensor
 
 
 def _torch_version_prereq(major, minor):
-    ver_major = int(torch.__version__.split('.')[0])
-    ver_minor = int(torch.__version__.split('.')[1])
+    ver_major = int(torch.__version__.split(".")[0])
+    ver_minor = int(torch.__version__.split(".")[1])
     return ver_major * 32 + ver_minor >= major * 32 + minor
 
 
@@ -23,14 +23,12 @@ def _maybe_torch_compile(func):
 
 
 # Don't use torch.compile for now due to PyTorch issue https://github.com/pytorch/pytorch/issues/124382
-def double_quant_impl(
-    A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0
-):
+def double_quant_impl(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
     """
     Find absolute max valus of each row/column of a tensor, and symmetrically quantize it to int8.
     If threshold > 0.0, only values <= threshold are counted. All outliers are zeroed out in
     the original tensor and they are kept in COO format: (rows, cols, valus)
-    If threashold == 0.0, there are no outliers.
+    If threshold == 0.0, there are no outliers.
     Args:
         A The tensor to be analyzed and quantized.
         col_stats Absolute max values of each column of A. If it is not None, use the values directly.
@@ -45,6 +43,7 @@ def double_quant_impl(
         each row of A, absolute max values of each column of A, outliers in COO format
     """
     from ..functional import COOSparseTensor
+
     cols = A.shape[-1]
     if len(A.shape) == 3:
         rows = A.shape[0] * A.shape[1]
@@ -56,8 +55,8 @@ def double_quant_impl(
     coo_tensor = None
 
     def get_row_col_stats(A):
-        row_stats = torch.max(torch.abs(A), 1).values # absolute max of each row
-        col_stats = torch.max(torch.abs(A), 0).values # absolute max of each col
+        row_stats = torch.max(torch.abs(A), 1).values  # absolute max of each row
+        col_stats = torch.max(torch.abs(A), 0).values  # absolute max of each col
         return row_stats, col_stats
 
     def quant_to_int8(A, stats):
@@ -67,23 +66,23 @@ def quant_to_int8(A, stats):
         if row_stats is None or col_stats is None:
             row_stats, col_stats = get_row_col_stats(A)
     else:
-        outlier_indices = torch.abs(A) >= threshold # find outliers
-        outlier_coord = outlier_indices.nonzero() # get outlier coordinates
-        outlier_rows = outlier_coord[:, 0] # outlier row for COO sparse tensor
-        outlier_cols = outlier_coord[:, 1] # outlier column for COO sparse tensor
-        outlier_values = A[outlier_indices] # outlier values for COO sparse tensor
+        outlier_indices = torch.abs(A) >= threshold  # find outliers
+        outlier_coord = outlier_indices.nonzero()  # get outlier coordinates
+        outlier_rows = outlier_coord[:, 0]  # outlier row for COO sparse tensor
+        outlier_cols = outlier_coord[:, 1]  # outlier column for COO sparse tensor
+        outlier_values = A[outlier_indices]  # outlier values for COO sparse tensor
         coo_tensor = COOSparseTensor(
             A.shape[0], A.shape[1], outlier_values.numel(), outlier_rows.int(), outlier_cols.int(), outlier_values
         )
         if row_stats is None or col_stats is None:
-            A[outlier_indices] = 0 # zero out outliers
+            A[outlier_indices] = 0  # zero out outliers
             row_stats, col_stats = get_row_col_stats(A)
 
     quant_by_row = quant_to_int8(A, row_stats.unsqueeze(-1))
     quant_by_col = quant_to_int8(A, col_stats.unsqueeze(0))
 
     if coo_tensor is not None:
-        A[outlier_indices] = outlier_values # restore outliers for later use
+        A[outlier_indices] = outlier_values  # restore outliers for later use
 
     if out_row is not None:
         out_row.copy_(quant_by_row)
@@ -97,9 +96,7 @@ def quant_to_int8(A, stats):
     return out_row, out_col, row_stats.float(), col_stats.float(), coo_tensor
 
 
-def igemmlt_impl(
-    A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32
-):
+def igemmlt_impl(A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32):
     """
     Do GEMMM computation. Data type: int8 * int8 -> int32.
     Args:
@@ -122,8 +119,8 @@ def igemmlt_impl(
     dimsB = B.ndim
     shapeA = A.shape
     shapeB = B.shape
-    assert dimsA in [2, 3], 'Only two or three dimensional matrices are supported for argument A'
-    assert dimsB == 2, 'Only two dimensional matrices are supported for argument B'
+    assert dimsA in [2, 3], "Only two or three dimensional matrices are supported for argument A"
+    assert dimsB == 2, "Only two dimensional matrices are supported for argument B"
 
     if dimsA == 2:
         m = shapeA[0]
@@ -131,7 +128,7 @@ def igemmlt_impl(
         m = shapeA[0] * shapeA[1]
     n = shapeB[0]
     k = shapeA[-1]
-    assert shapeA[-1] == shapeB[-1], f'Shapes of A and B do not match, got {shapeA} and {shapeB}'
+    assert shapeA[-1] == shapeB[-1], f"Shapes of A and B do not match, got {shapeA} and {shapeB}"
 
     # if the tensor is empty, return a transformed empty tensor with the right dimensions
     if shapeA[0] == 0 and dimsA == 2:
@@ -169,7 +166,7 @@ def mm_dequant_impl(
     new_col_stats=None,
     bias=None,
     compute_dtype=torch.float32,
-    output_dtype=torch.float32
+    output_dtype=torch.float32,
 ):
     """
     Dequant and add bias
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 54a161f7a..9828add30 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1945,7 +1945,7 @@ class COOSparseTensor:
     def __init__(self, rows, cols, nnz, rowidx, colidx, values):
         assert rowidx.dtype == torch.int32
         assert colidx.dtype == torch.int32
-        if values.device == torch.device('cpu'):
+        if values.device == torch.device("cpu"):
             assert values.dtype in [torch.bfloat16, torch.float]
         else:
             assert values.dtype == torch.float16
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 9295e4c70..f2b5e34b8 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -590,8 +590,8 @@ def cpu(self):
         if SCBt is not None:
             del SCBt
         self.data = CB
-        setattr(self, "CB", CB)
-        setattr(self, "SCB", SCB)
+        self.CB = CB
+        self.SCB = SCB
         return self
 
     @overload
@@ -613,10 +613,7 @@ def to(self, *args, **kwargs):
 
         if device.type == "cuda" and self.data.device.type == "cpu":
             return self.cuda(device)
-        elif (
-            device.type == "cpu"
-            and self.data.dtype != torch.int8
-        ):
+        elif device.type == "cpu" and self.data.dtype != torch.int8:
             return self.cpu()
         else:
             new_param = Int8Params(
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 566d8429f..94b4222c2 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -928,7 +928,7 @@ def test_colrow_absmax(dim1, dim2, dims):
 
 @pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2"))
-@pytest.mark.parametrize("device", ["cuda","cpu"], ids=id_formatter("device"))
+@pytest.mark.parametrize("device", ["cuda", "cpu"], ids=id_formatter("device"))
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype"))
 def test_double_quant(dim1, dim2, device, dtype):
     if device == "cuda" and dtype == torch.bfloat16:
@@ -1208,7 +1208,7 @@ def test_overflow():
 
 @pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2"))
-@pytest.mark.parametrize("device", ["cuda","cpu"], ids=id_formatter("device"))
+@pytest.mark.parametrize("device", ["cuda", "cpu"], ids=id_formatter("device"))
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype"))
 def test_coo_double_quant(dim1, dim2, device, dtype):
     if device == "cuda" and dtype == torch.bfloat16:

From e1b60d3093759bca952b9aefea4ba1140c1e2340 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Thu, 25 Apr 2024 23:51:57 -0700
Subject: [PATCH 125/234] Fix backward

---
 bitsandbytes/autograd/_functions.py | 3 +++
 bitsandbytes/functional.py          | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 08d2d9fa6..7d570f28b 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -94,6 +94,9 @@ def undo_layout(permuted_tensor: torch.Tensor, tile_indices: torch.LongTensor) -
     :param tile_indices: reverse transformation indices, from get_inverse_transform_indices
     :return: contiguous row-major tensor
     """
+    # CPU has no change on layout
+    if permuted_tensor.device.type == "cpu":
+        return permuted_tensor
     (rows, cols), (tile_rows, tile_cols) = permuted_tensor.shape, tile_indices.shape
     assert rows % tile_rows == cols % tile_cols == 0, "tensor must contain a whole number of tiles"
     tensor = permuted_tensor.reshape(-1, tile_indices.numel()).t()
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 9828add30..8fd62fd04 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1946,7 +1946,7 @@ def __init__(self, rows, cols, nnz, rowidx, colidx, values):
         assert rowidx.dtype == torch.int32
         assert colidx.dtype == torch.int32
         if values.device == torch.device("cpu"):
-            assert values.dtype in [torch.bfloat16, torch.float]
+            assert values.dtype in [torch.bfloat16, torch.half, torch.float]
         else:
             assert values.dtype == torch.float16
         assert values.numel() == nnz

From 60d7560a6010eeee1bab9ef66a82eb501891b74a Mon Sep 17 00:00:00 2001
From: root <root@aus-navi3x-09.amd.com>
Date: Fri, 26 Apr 2024 20:51:29 +0000
Subject: [PATCH 126/234] adding arch detection for test_gemv_eye_4bit

---
 bitsandbytes/cextension.py | 11 ++++++++++-
 tests/test_functional.py   |  3 ++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 69cf0b15f..f5924f7f9 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -19,6 +19,9 @@
 import ctypes as ct
 import logging
 import os
+import subprocess
+import re
+
 from pathlib import Path
 
 import torch
@@ -117,8 +120,14 @@ def get_native_library() -> BNBNativeLibrary:
     if torch.version.hip:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
         HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
+        result = subprocess.run(['rocminfo'], capture_output=True, text=True)
+        match = re.search(r'Name:\s+gfx(\d+)', result.stdout)
+        if match:
+            ROCM_GPU_ARCH = "gfx" + match.group(1)
+        else:
+            ROCM_GPU_ARCH = "unknown"
     else:
-        HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
+        HIP_ENVIRONMENT, BNB_HIP_VERSION, ROCM_GPU_ARCH = False, 0, "unknown"
     lib = get_native_library()
 except Exception as e:
     lib = None
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 04a898d4b..dffe724a6 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -11,7 +11,7 @@
 
 import bitsandbytes as bnb
 from bitsandbytes import functional as F
-from bitsandbytes.cextension import BNB_HIP_VERSION, HIP_ENVIRONMENT
+from bitsandbytes.cextension import BNB_HIP_VERSION, HIP_ENVIRONMENT, ROCM_GPU_ARCH
 from tests.helpers import BOOLEAN_TUPLES, TRUE_FALSE, describe_dtype, get_blocksizes, get_test_dims, id_formatter
 
 torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
@@ -2242,6 +2242,7 @@ def test_managed():
 @pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
 @pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
+@pytest.mark.skipif(HIP_ENVIRONMENT and ROCM_GPU_ARCH == "gfx90a", reason="this test is not supported on ROCm with gfx90a architecture yet")
 def test_gemv_eye_4bit(storage_type, dtype, double_quant):
     dims = 10
     torch.random.manual_seed(np.random.randint(0, 412424242))

From cae33c38d56d2b7a42f1b481ef60c8218374c8be Mon Sep 17 00:00:00 2001
From: root <root@aus-navi3x-09.amd.com>
Date: Mon, 29 Apr 2024 18:57:51 +0000
Subject: [PATCH 127/234] implement get_rocm_gpu_arch

---
 bitsandbytes/cextension.py | 12 ++++--------
 bitsandbytes/cuda_specs.py | 26 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index f5924f7f9..090c6116a 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -27,7 +27,7 @@
 import torch
 
 from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR
-from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs
+from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_rocm_gpu_arch
 
 logger = logging.getLogger(__name__)
 
@@ -116,18 +116,14 @@ def get_native_library() -> BNBNativeLibrary:
     return BNBNativeLibrary(dll)
 
 
+ROCM_GPU_ARCH = get_rocm_gpu_arch()
+
 try:
     if torch.version.hip:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
         HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
-        result = subprocess.run(['rocminfo'], capture_output=True, text=True)
-        match = re.search(r'Name:\s+gfx(\d+)', result.stdout)
-        if match:
-            ROCM_GPU_ARCH = "gfx" + match.group(1)
-        else:
-            ROCM_GPU_ARCH = "unknown"
     else:
-        HIP_ENVIRONMENT, BNB_HIP_VERSION, ROCM_GPU_ARCH = False, 0, "unknown"
+        HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
     lib = get_native_library()
 except Exception as e:
     lib = None
diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index 50c139317..58c43789c 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -1,5 +1,8 @@
 import dataclasses
 from typing import List, Optional, Tuple
+import logging
+import subprocess
+import re
 
 import torch
 
@@ -42,3 +45,26 @@ def get_cuda_specs() -> Optional[CUDASpecs]:
         cuda_version_string=(get_cuda_version_string()),
         cuda_version_tuple=get_cuda_version_tuple(),
     )
+
+def get_rocm_gpu_arch() -> str:
+    logger = logging.getLogger(__name__)
+    try:
+        if torch.version.hip:
+            result = subprocess.run(['rocminfo'], capture_output=True, text=True)
+            match = re.search(r'Name:\s+gfx(\d+)', result.stdout)
+            if match:
+                return "gfx" + match.group(1)
+            else:
+                return "unknown"
+        else:
+            return "unknown"
+    except Exception as e:
+        logger.error(f"Could not detect ROCm GPU architecture: {e}")
+        if torch.cuda.is_available():
+            logger.warning(
+            """
+ROCm GPU architecture detection failed despite ROCm being available.
+            """,
+        )
+        return "unknown"
+

From da53f39fba7a6516aec228b60c7ff1199b6c510b Mon Sep 17 00:00:00 2001
From: root <root@aus-navi3x-09.amd.com>
Date: Tue, 30 Apr 2024 00:01:29 +0000
Subject: [PATCH 128/234] fixing lint

---
 bitsandbytes/archive_functional.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/bitsandbytes/archive_functional.py b/bitsandbytes/archive_functional.py
index dac7430ed..b050a6018 100644
--- a/bitsandbytes/archive_functional.py
+++ b/bitsandbytes/archive_functional.py
@@ -170,7 +170,9 @@ def get_instance(cls):
 dtype2bytes[torch.int8] = 1
 
 
-def get_paged(*shape, dtype=torch.float32, device=torch.device("cuda", index=0)):
+def get_paged(*shape, dtype=torch.float32, device=None):
+    if device is None:
+        torch.device("cuda", index=0)
     num_bytes = dtype2bytes[dtype] * prod(shape)
     cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
     c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int))
@@ -246,8 +248,8 @@ def create_linear_map(signed=True, total_bits=8, add_zero=True):
     if gap == 0:
         return values
     else:
-        l = values.numel() // 2
-        return torch.Tensor(values[:l].tolist() + [0] * gap + values[l:].tolist())
+        l_var = values.numel() // 2
+        return torch.Tensor(values[:l_var].tolist() + [0] * gap + values[l_var:].tolist())
 
 
 def create_normal_map(offset=0.9677083, use_extra_value=True):
@@ -679,7 +681,7 @@ def quantize_blockwise(
 
 def dequantize_blockwise(
     A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
+    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
     absmax: Tensor = None,
     code: Tensor = None,
     out: Tensor = None,
@@ -857,7 +859,7 @@ def quantize_4bit(
 
 def dequantize_fp4(
     A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
+    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
     absmax: Tensor = None,
     out: Tensor = None,
     blocksize: int = 64,
@@ -867,7 +869,7 @@ def dequantize_fp4(
 
 def dequantize_nf4(
     A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
+    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
     absmax: Tensor = None,
     out: Tensor = None,
     blocksize: int = 64,
@@ -877,7 +879,7 @@ def dequantize_nf4(
 
 def dequantize_4bit(
     A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
+    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
     absmax: Tensor = None,
     out: Tensor = None,
     blocksize: int = 64,
@@ -979,7 +981,7 @@ def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
 
 def dequantize(
     A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
+    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
     absmax: Tensor = None,
     code: Tensor = None,
     out: Tensor = None,

From ae4dcec5279ca53b8dbcc624063b3f9b03b156ec Mon Sep 17 00:00:00 2001
From: root <root@aus-navi3x-09.amd.com>
Date: Tue, 30 Apr 2024 00:22:56 +0000
Subject: [PATCH 129/234] fixing lint

---
 bitsandbytes/archive_functional.py | 2 +-
 bitsandbytes/cextension.py         | 3 ---
 bitsandbytes/cuda_specs.py         | 8 ++++----
 tests/test_functional.py           | 5 ++++-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/bitsandbytes/archive_functional.py b/bitsandbytes/archive_functional.py
index b050a6018..53b0c3ce6 100644
--- a/bitsandbytes/archive_functional.py
+++ b/bitsandbytes/archive_functional.py
@@ -6,7 +6,7 @@
 from functools import reduce  # Required in Python 3
 import itertools
 import operator
-from typing import Tuple
+from typing import Optional, Tuple
 
 import numpy as np
 from scipy.stats import norm
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 090c6116a..03d2cbd61 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -19,9 +19,6 @@
 import ctypes as ct
 import logging
 import os
-import subprocess
-import re
-
 from pathlib import Path
 
 import torch
diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index 58c43789c..d532b738c 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -1,8 +1,8 @@
 import dataclasses
-from typing import List, Optional, Tuple
 import logging
-import subprocess
 import re
+import subprocess
+from typing import List, Optional, Tuple
 
 import torch
 
@@ -62,9 +62,9 @@ def get_rocm_gpu_arch() -> str:
         logger.error(f"Could not detect ROCm GPU architecture: {e}")
         if torch.cuda.is_available():
             logger.warning(
-            """
+                """
 ROCm GPU architecture detection failed despite ROCm being available.
-            """,
+                """,
         )
         return "unknown"
 
diff --git a/tests/test_functional.py b/tests/test_functional.py
index dffe724a6..b5a3fab35 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2242,7 +2242,10 @@ def test_managed():
 @pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
 @pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
-@pytest.mark.skipif(HIP_ENVIRONMENT and ROCM_GPU_ARCH == "gfx90a", reason="this test is not supported on ROCm with gfx90a architecture yet")
+@pytest.mark.skipif(
+        HIP_ENVIRONMENT and ROCM_GPU_ARCH == "gfx90a", 
+        reason="this test is not supported on ROCm with gfx90a architecture yet",
+)
 def test_gemv_eye_4bit(storage_type, dtype, double_quant):
     dims = 10
     torch.random.manual_seed(np.random.randint(0, 412424242))

From 21d5ff6066389ecbafa5963a934a362091307fbd Mon Sep 17 00:00:00 2001
From: root <root@aus-navi3x-09.amd.com>
Date: Tue, 30 Apr 2024 14:34:52 +0000
Subject: [PATCH 130/234] correct lint error

---
 bitsandbytes/archive_functional.py | 20 +++++++++-----------
 bitsandbytes/cuda_specs.py         |  8 ++++----
 tests/test_functional.py           |  4 ++--
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/bitsandbytes/archive_functional.py b/bitsandbytes/archive_functional.py
index 53b0c3ce6..dac7430ed 100644
--- a/bitsandbytes/archive_functional.py
+++ b/bitsandbytes/archive_functional.py
@@ -6,7 +6,7 @@
 from functools import reduce  # Required in Python 3
 import itertools
 import operator
-from typing import Optional, Tuple
+from typing import Tuple
 
 import numpy as np
 from scipy.stats import norm
@@ -170,9 +170,7 @@ def get_instance(cls):
 dtype2bytes[torch.int8] = 1
 
 
-def get_paged(*shape, dtype=torch.float32, device=None):
-    if device is None:
-        torch.device("cuda", index=0)
+def get_paged(*shape, dtype=torch.float32, device=torch.device("cuda", index=0)):
     num_bytes = dtype2bytes[dtype] * prod(shape)
     cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
     c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int))
@@ -248,8 +246,8 @@ def create_linear_map(signed=True, total_bits=8, add_zero=True):
     if gap == 0:
         return values
     else:
-        l_var = values.numel() // 2
-        return torch.Tensor(values[:l_var].tolist() + [0] * gap + values[l_var:].tolist())
+        l = values.numel() // 2
+        return torch.Tensor(values[:l].tolist() + [0] * gap + values[l:].tolist())
 
 
 def create_normal_map(offset=0.9677083, use_extra_value=True):
@@ -681,7 +679,7 @@ def quantize_blockwise(
 
 def dequantize_blockwise(
     A: Tensor,
-    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
+    quant_state: Tuple[Tensor, Tensor] = None,
     absmax: Tensor = None,
     code: Tensor = None,
     out: Tensor = None,
@@ -859,7 +857,7 @@ def quantize_4bit(
 
 def dequantize_fp4(
     A: Tensor,
-    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
+    quant_state: Tuple[Tensor, Tensor] = None,
     absmax: Tensor = None,
     out: Tensor = None,
     blocksize: int = 64,
@@ -869,7 +867,7 @@ def dequantize_fp4(
 
 def dequantize_nf4(
     A: Tensor,
-    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
+    quant_state: Tuple[Tensor, Tensor] = None,
     absmax: Tensor = None,
     out: Tensor = None,
     blocksize: int = 64,
@@ -879,7 +877,7 @@ def dequantize_nf4(
 
 def dequantize_4bit(
     A: Tensor,
-    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
+    quant_state: Tuple[Tensor, Tensor] = None,
     absmax: Tensor = None,
     out: Tensor = None,
     blocksize: int = 64,
@@ -981,7 +979,7 @@ def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
 
 def dequantize(
     A: Tensor,
-    quant_state: Optional[Tuple[Tensor, Tensor]] = None,
+    quant_state: Tuple[Tensor, Tensor] = None,
     absmax: Tensor = None,
     code: Tensor = None,
     out: Tensor = None,
diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index d532b738c..e104762e3 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -46,12 +46,13 @@ def get_cuda_specs() -> Optional[CUDASpecs]:
         cuda_version_tuple=get_cuda_version_tuple(),
     )
 
+
 def get_rocm_gpu_arch() -> str:
     logger = logging.getLogger(__name__)
     try:
         if torch.version.hip:
-            result = subprocess.run(['rocminfo'], capture_output=True, text=True)
-            match = re.search(r'Name:\s+gfx(\d+)', result.stdout)
+            result = subprocess.run(["rocminfo"], capture_output=True, text=True)
+            match = re.search(r"Name:\s+gfx(\d+)", result.stdout)
             if match:
                 return "gfx" + match.group(1)
             else:
@@ -65,6 +66,5 @@ def get_rocm_gpu_arch() -> str:
                 """
 ROCm GPU architecture detection failed despite ROCm being available.
                 """,
-        )
+            )
         return "unknown"
-
diff --git a/tests/test_functional.py b/tests/test_functional.py
index b5a3fab35..8acd5395d 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2243,8 +2243,8 @@ def test_managed():
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
 @pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
 @pytest.mark.skipif(
-        HIP_ENVIRONMENT and ROCM_GPU_ARCH == "gfx90a", 
-        reason="this test is not supported on ROCm with gfx90a architecture yet",
+    HIP_ENVIRONMENT and ROCM_GPU_ARCH == "gfx90a",
+    reason="this test is not supported on ROCm with gfx90a architecture yet",
 )
 def test_gemv_eye_4bit(storage_type, dtype, double_quant):
     dims = 10

From 95c29a63ba04be0ce48bc2031861753f4e8215c4 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Sun, 5 May 2024 19:42:02 -0700
Subject: [PATCH 131/234] Fix lint issue

---
 bitsandbytes/backends/cpu_xpu_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index c4bd25a04..815349e46 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -25,9 +25,9 @@ def _maybe_torch_compile(func):
 # Don't use torch.compile for now due to PyTorch issue https://github.com/pytorch/pytorch/issues/124382
 def double_quant_impl(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
     """
-    Find absolute max valus of each row/column of a tensor, and symmetrically quantize it to int8.
+    Find absolute max values of each row/column of a tensor, and symmetrically quantize it to int8.
     If threshold > 0.0, only values <= threshold are counted. All outliers are zeroed out in
-    the original tensor and they are kept in COO format: (rows, cols, valus)
+    the original tensor and they are kept in COO format: (rows, cols, values)
     If threshold == 0.0, there are no outliers.
     Args:
         A The tensor to be analyzed and quantized.

From 765bfc83e05fd7203affd0d86b886ca40f2f26da Mon Sep 17 00:00:00 2001
From: lcskrishna <lollachaitanya@gmail.com>
Date: Mon, 6 May 2024 14:45:25 +0000
Subject: [PATCH 132/234] update extract_outliers, quantize_4bit,
 dequantize_4bit

---
 bitsandbytes/backends/cuda.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 026a4c9a9..7cc88326d 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -322,7 +322,10 @@ def mm_dequant(
     def extract_outliers(self, A, SA, idx):
         shapeA = SA[0]
         formatA = SA[1]
-        assert formatA in ["col_turing", "col_ampere"]
+        if not HIP_ENVIRONMENT:
+            assert formatA in ["col_turing", "col_ampere"]
+        else:
+            assert formatA in ["col"]
         assert A.device.type == "cuda"
 
         out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device)
@@ -336,7 +339,7 @@ def extract_outliers(self, A, SA, idx):
 
         prev_device = pre_call(A.device)
 
-        if formatA == "col_turing":
+        if formatA == "col_turing" or HIP_ENVIRONMENT::
             lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
         elif formatA == "col_ampere":
             lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
@@ -355,6 +358,8 @@ def quantize_4bit(
         quant_type="fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
+        if HIP_ENVIRONMENT:
+            blocksize = 128
         if A.device.type != "cuda":
             raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}")
         if quant_type not in ["fp4", "nf4"]:
@@ -372,7 +377,10 @@ def quantize_4bit(
             mod = dtype2bytes[quant_storage] * 2
             out = torch.zeros(((n + 1) // mod, 1), dtype=quant_storage, device=A.device)
 
-        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        if not HIP_ENVIRONMENT:
+            assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        else:
+            assert blocksize in [4096, 2048, 1024, 512, 256, 128]
 
         prev_device = pre_call(A.device)
         is_on_gpu([A, out, absmax])
@@ -446,9 +454,14 @@ def dequantize_4bit(
         blocksize: int = 64,
         quant_type="fp4",
     ) -> torch.Tensor:
-        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
+        if HIP_ENVIRONMENT:
+            blocksize = 128
+        supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
+        if HIP_ENVIRONMENT:
+            supported_blocksizes = supported_blocksizes[:-1]
+        if blocksize not in supported_blocksizes:
             raise ValueError(
-                f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]"
+                f"The blockwise of {blocksize} is not supported. Supported values: {supported_blocksizes}"
             )
 
         if quant_type not in ["fp4", "nf4"]:

From d00c026a9eded99f81ccc270e6e84ccd1e136240 Mon Sep 17 00:00:00 2001
From: lcskrishna <lollachaitanya@gmail.com>
Date: Mon, 6 May 2024 15:56:50 +0000
Subject: [PATCH 133/234] minor fixes for extract_outliers

---
 bitsandbytes/backends/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 7cc88326d..2d962711a 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -339,7 +339,7 @@ def extract_outliers(self, A, SA, idx):
 
         prev_device = pre_call(A.device)
 
-        if formatA == "col_turing" or HIP_ENVIRONMENT::
+        if formatA == "col_turing" or HIP_ENVIRONMENT:
             lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
         elif formatA == "col_ampere":
             lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)

From e5574bdc9ec3d605e4358cd2598599b7088e110c Mon Sep 17 00:00:00 2001
From: lcskrishna <lollachaitanya@gmail.com>
Date: Mon, 6 May 2024 16:39:29 +0000
Subject: [PATCH 134/234] update blocksizes for quantize and dequantize

---
 bitsandbytes/backends/cuda.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 2d962711a..e04ad4708 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -353,13 +353,13 @@ def quantize_4bit(
         A: torch.Tensor,
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
-        blocksize=64,
+        blocksize: Optional[int] = None,
         compress_statistics=False,
         quant_type="fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
-        if HIP_ENVIRONMENT:
-            blocksize = 128
+        if blocksize is None:
+            blocksize = 64 if not HIP_ENVIRONMENT else 128
         if A.device.type != "cuda":
             raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}")
         if quant_type not in ["fp4", "nf4"]:
@@ -451,11 +451,11 @@ def dequantize_4bit(
         quant_state: Optional[QuantState] = None,
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
-        blocksize: int = 64,
+        blocksize: Optional[int] = None,
         quant_type="fp4",
     ) -> torch.Tensor:
-        if HIP_ENVIRONMENT:
-            blocksize = 128
+        if blocksize is None:
+            blocksize = 64 if not HIP_ENVIRONMENT else 128
         supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
         if HIP_ENVIRONMENT:
             supported_blocksizes = supported_blocksizes[:-1]

From b0dec0a55c3464ed97a8cadeb1ba0d43f2704c25 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Tue, 7 May 2024 00:48:29 -0700
Subject: [PATCH 135/234] Update bitsandbytes/backends/cpu_xpu_common.py

---
 bitsandbytes/backends/cpu_xpu_common.py | 26 +++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 815349e46..5a0f0f9d5 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -2,6 +2,16 @@
 
 import torch
 
+try:
+    # to support Intel CPU/GPU (XPU) backend
+    import intel_extension_for_pytorch as ipex
+    ipex_cpu = ipex if ipex._C._has_cpu() else None
+    ipex_xpu = ipex if ipex._C._has_xpu() else None
+except:
+    ipex_cpu = None
+    ipex_xpu = None
+
+
 Tensor = torch.Tensor
 
 
@@ -11,6 +21,22 @@ def _torch_version_prereq(major, minor):
     return ver_major * 32 + ver_minor >= major * 32 + minor
 
 
+def _ipex_cpu_version_prereq(major, minor):
+    if ipex_cpu is not None:
+        ver_major = ipex_cpu.__version__.split(".")[0]
+        ver_minor = ipex_cpu.__version__.split(".")[1]
+        return int(ver_major) * 32 + int(ver_minor) >= major * 32 + minor
+    return False
+
+
+def _ipex_xpu_version_prereq(major, minor):
+    if ipex_xpu is not None:
+        ver_major = ipex_xpu.__version__.split(".")[0]
+        ver_minor = ipex_xpu.__version__.split(".")[1]
+        return int(ver_major) * 32 + int(ver_minor) >= major * 32 + minor
+    return False
+
+
 def _maybe_torch_compile(func):
     # torch.compile requires pytorch >= 2.0
     if _torch_version_prereq(2, 0):

From 295bb973c301bfbb5d51aed5a2b79e955840296b Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Tue, 7 May 2024 02:40:01 -0700
Subject: [PATCH 136/234] Fix lint issue

---
 bitsandbytes/backends/cpu.py            | 1 -
 bitsandbytes/backends/cpu_xpu_common.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 97e6580ed..d6a9192e4 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -5,7 +5,6 @@
 from bitsandbytes.utils import QuantState
 
 from .base import Backend
-
 from .cpu_xpu_common import (
     double_quant_impl,
     igemmlt_impl,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 5a0f0f9d5..ceac893b4 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -5,6 +5,7 @@
 try:
     # to support Intel CPU/GPU (XPU) backend
     import intel_extension_for_pytorch as ipex
+
     ipex_cpu = ipex if ipex._C._has_cpu() else None
     ipex_xpu = ipex if ipex._C._has_xpu() else None
 except:

From 7ab3a054101d654893bffd280320de0cbed1152c Mon Sep 17 00:00:00 2001
From: lcskrishna <lollachaitanya@gmail.com>
Date: Tue, 7 May 2024 13:01:52 +0000
Subject: [PATCH 137/234] update reg expression for detecting arch

---
 bitsandbytes/cuda_specs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index e104762e3..0afecd3ea 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -52,7 +52,7 @@ def get_rocm_gpu_arch() -> str:
     try:
         if torch.version.hip:
             result = subprocess.run(["rocminfo"], capture_output=True, text=True)
-            match = re.search(r"Name:\s+gfx(\d+)", result.stdout)
+            match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
             if match:
                 return "gfx" + match.group(1)
             else:

From 9cd1d8c751c0620a67006f5bed6c20953dc95cba Mon Sep 17 00:00:00 2001
From: lcskrishna <lollachaitanya@gmail.com>
Date: Tue, 7 May 2024 13:21:45 +0000
Subject: [PATCH 138/234] linter updates

---
 bitsandbytes/backends/cuda.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index e04ad4708..a449b493c 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -3,7 +3,7 @@
 
 import torch
 
-from bitsandbytes.cextension import lib, HIP_ENVIRONMENT
+from bitsandbytes.cextension import HIP_ENVIRONMENT, lib
 from bitsandbytes.functional import (
     CUBLAS_Context,
     coo_zeros,
@@ -13,8 +13,8 @@
     get_colrow_absmax,
     get_ptr,
     get_transform_buffer,
-    nvidia_transform,
     is_on_gpu,
+    nvidia_transform,
     post_call,
     pre_call,
     prod,
@@ -254,7 +254,7 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         ptrRowScale = get_ptr(None)
         is_on_gpu([A, B, out])
 
-        if formatB == "col_turing"  or HIP_ENVIRONMENT:
+        if formatB == "col_turing" or HIP_ENVIRONMENT:
             if dtype == torch.int32:
                 has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
             else:

From 37b05821a5decab33c67527b9e365cbf0fbdf2f0 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Tue, 7 May 2024 06:34:32 -0700
Subject: [PATCH 139/234] Fix lint issue

---
 bitsandbytes/backends/cpu_xpu_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index ceac893b4..f4e5ed3ec 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -8,7 +8,7 @@
 
     ipex_cpu = ipex if ipex._C._has_cpu() else None
     ipex_xpu = ipex if ipex._C._has_xpu() else None
-except:
+except BaseException:
     ipex_cpu = None
     ipex_xpu = None
 

From 09cc153dea939f23747bea622560c84b5a95183f Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Wed, 8 May 2024 02:10:49 -0700
Subject: [PATCH 140/234] Support NF4 on CPU backend

---
 bitsandbytes/autograd/_functions.py     |   3 +-
 bitsandbytes/backends/cpu.py            |  15 +-
 bitsandbytes/backends/cpu_xpu_common.py | 266 +++++++++++++++++++++++-
 bitsandbytes/nn/modules.py              |   7 +-
 4 files changed, 284 insertions(+), 7 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 7d570f28b..6dea211ff 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -572,7 +572,8 @@ def matmul_4bit(
     bias=None,
 ):
     assert quant_state is not None
-    if A.numel() == A.shape[-1] and A.requires_grad == False:
+    if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False:
+        # CPU backend does not require A to be a vector
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(
                 f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index d6a9192e4..a5e123e62 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -9,6 +9,9 @@
     double_quant_impl,
     igemmlt_impl,
     mm_dequant_impl,
+    quantize_4bit_impl,
+    dequantize_4bit_impl,
+    gemm_4bit_impl,
 )
 
 Tensor = torch.Tensor
@@ -132,7 +135,8 @@ def quantize_4bit(
         quant_type: Literal["fp4", "nf4"] = "fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
-        raise NotImplementedError("Not yet implemented for CPU backend")
+        assert_on_cpu([A, absmax, out])
+        return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
 
     def dequantize_4bit(
         self,
@@ -143,7 +147,8 @@ def dequantize_4bit(
         blocksize: int = 64,
         quant_type: Literal["fp4", "nf4"] = "fp4",
     ) -> torch.Tensor:
-        raise NotImplementedError("Not yet implemented for CPU backend")
+        assert_on_cpu([A, absmax, out])
+        return dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
 
     def gemv_4bit(
         self,
@@ -154,7 +159,11 @@ def gemv_4bit(
         transposed_B=False,
         state: QuantState = None,
     ) -> torch.Tensor:
-        raise NotImplementedError("Not yet implemented for CPU backend")
+        assert_on_cpu([A, B, out])
+        if state is None:
+            raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
+
+        return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
 
     def dequantize_blockwise(
         self,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index f4e5ed3ec..078b81680 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -1,6 +1,12 @@
 import warnings
-
 import torch
+from typing import Optional
+from bitsandbytes.functional import (
+    get_4bit_type,
+    quantize_blockwise,
+    dequantize_blockwise,
+    QuantState,
+)
 
 try:
     # to support Intel CPU/GPU (XPU) backend
@@ -228,3 +234,261 @@ def mm_dequant_impl(
         out = out + bias.to(compute_dtype)
     out = out.to(output_dtype)
     return out
+
+
+NF4_QUANT_TABLE = [
+    -1.0 - 1e-2,           # 0b0000
+    -0.8480964004993439,   # 0b0001
+    -0.6106329262256622,   # 0b0010
+    -0.4599952697753906,   # 0b0011
+    -0.33967943489551544,  # 0b0100
+    -0.23460740596055984,  # 0b0101
+    -0.13791173323988914,  # 0b0110
+    -0.045525018125772476, # 0b0111
+    0.03979014977812767,   # 0b1000
+    0.1202552504837513,    # 0b1001
+    0.2035212516784668,    # 0b1010
+    0.2920137718319893,    # 0b1011
+    0.3893125355243683,    # 0b1100
+    0.5016634166240692,    # 0b1101
+    0.6427869200706482,    # 0b1110
+    0.8614784181118011,    # 0b1111
+]
+
+
+# It's faster not to use torch.compile
+def quantize_4bit_impl(
+    A: Tensor,
+    absmax: Tensor = None,
+    out: Tensor = None,
+    blocksize=64,
+    compress_statistics=False,
+    quant_type="nf4",
+) -> Tensor:
+    """
+    Quantize tensor A in blocks of 4-bit values.
+
+    Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input tensor.
+    absmax : torch.Tensor
+        The absmax values.
+    out : torch.Tensor
+        The output tensor (8-bit).
+    blocksize : int
+        The blocksize used in quantization.
+    quant_type : str
+        The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now
+
+    Returns
+    -------
+    torch.Tensor:
+        The 8-bit tensor with packed 4-bit values.
+    tuple(torch.Tensor, torch.Size, torch.dtype, int):
+        The quantization state to undo the quantization.
+    """
+    if quant_type != "nf4":
+        raise NotImplementedError(
+            f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU."
+        )
+    n = A.numel()
+    input_shape = A.shape
+    blocks = n // blocksize
+    blocks += 1 if n % blocksize > 0 else 0
+
+    if absmax is None:
+        absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype)
+
+    if out is None:
+        out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device)
+
+    assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+    rem = n % blocksize
+    has_rem = rem > 0
+
+    # Scale tensor to [-1, 1]
+    A_reshaped = A.reshape(n)
+    A_com = A_reshaped[:n - rem]
+    A_com_reshaped = A_com.reshape(n // blocksize, blocksize)
+    absmax[:blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0]
+    scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[:blocks - has_rem].view(-1, 1)), -1, 1)
+    scaled_A = scaled_A.reshape(-1)
+    if has_rem:
+        absmax[-1] = torch.abs(A_reshaped[n - rem:]).max()
+        scaled_A_rem = torch.clamp(A_reshaped[n - rem:] * (1 / absmax[-1]), -1, 1)
+        scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
+    # map [-1, 1] to nf4
+    out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8)
+    for i in range(len(NF4_QUANT_TABLE)):
+        out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i
+    if out_uint8.size(-1) % 2:
+        out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0)
+    out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2])
+
+    code = get_4bit_type(quant_type, device=A.device)
+
+    if compress_statistics:
+        raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
+    else:
+        state = QuantState(
+            absmax=absmax,
+            shape=input_shape,
+            dtype=A.dtype,
+            blocksize=blocksize,
+            code=code,
+            quant_type=quant_type,
+        )
+
+    if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and input_shape[0] % blocksize == 0:
+        state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
+            out.reshape([input_shape[0], input_shape[1] // 2]),
+            ipex_cpu.quantization.WoqWeightDtype.NF4,
+            input_shape, # weight shape
+            absmax.view(input_shape[0], input_shape[1] // blocksize), # scales
+            None, # zero_points
+            None, # bias
+            None, # g_idx
+            None, # batch_size
+            blocksize,
+            int(ipex_cpu.quantization.WoqLowpMode.BF16),
+            -1, # act_quant_mode
+        )
+
+    return out, state
+
+
+@_maybe_torch_compile
+def dequantize_4bit_impl(
+    A: Tensor,
+    quant_state = None,
+    absmax: Tensor = None,
+    out: Tensor = None,
+    blocksize: int = 64,
+    quant_type="nf4",
+) -> Tensor:
+    """
+    Dequantizes FP4 blockwise quantized values.
+
+    Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The input 8-bit tensor (packed 4-bit values).
+    quant_state : QuantState
+        object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
+    absmax : torch.Tensor
+        The absmax values.
+    out : torch.Tensor
+        Dequantized output tensor.
+    blocksize : int
+        The blocksize used in quantization.
+    quant_type : str
+        The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now
+
+
+    Returns
+    -------
+    torch.Tensor:
+        Dequantized tensor.
+    """
+
+    if quant_state is None:
+        assert absmax is not None and out is not None
+
+        quant_state = QuantState(
+            absmax=absmax,
+            shape=out.shape,
+            dtype=out.dtype,
+            blocksize=blocksize,
+            quant_type=quant_type,
+        )
+
+    else:
+        absmax = quant_state.absmax
+
+    if quant_state.quant_type != "nf4":
+        raise NotImplementedError(
+            f"4-bit quantization data type {quant_state.quant_type} is not implemented for CPU/XPU."
+        )
+
+    if quant_state.nested:
+        raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
+
+    if out is None:
+        out = torch.empty(
+            quant_state.shape, dtype=quant_state.dtype, device=A.device
+        )
+
+    n = out.numel()
+    # Map nf4 to [-1, 1]
+    out_uint8 = torch.empty(A.size(0) * 2, dtype=torch.uint8, device=A.device)
+    out_uint8[::2] = A.bitwise_and(0xF)
+    out_uint8[1::2] = A.bitwise_right_shift(4)
+    out_dq = torch.empty(out_uint8.shape).to(quant_state.dtype)
+    for i in range(len(quant_state.code)):
+        out_dq[out_uint8 == i] = quant_state.code[i]
+
+    # Apply scales
+    if out_dq.numel() != n:
+        assert out_dq.numel() == n + 1
+        out_dq = torch.narrow(out_dq, 0, 0, n)
+    blocks = n // blocksize
+    blocks += 1 if n % blocksize > 0 else 0
+    rem = n % blocksize
+    has_rem = rem > 0
+    out_reshaped = out.reshape(-1)
+    out_reshaped[:n - rem] = (out_dq[:n - rem].view(-1, blocksize) * absmax[:blocks - has_rem].view(-1, 1)).reshape(-1)
+    if has_rem:
+        out_reshaped[n - rem:] = out_dq[n - rem:] * absmax[-1]
+
+    # take transpose here because weight is transposed (again) for computation
+    return out.t()
+
+
+# Do not need torch.compile here as we are calling torch/ipex kernel
+def gemm_4bit_impl(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    transposed_A=False,
+    transposed_B=False,
+    state: QuantState = None,
+) -> torch.Tensor:
+    """
+    Matrix-matrix multiplication with 4-bit quantization.
+
+    Parameters
+    ----------
+    A : torch.Tensor
+        The first input tensor. Usually the activation tensor.
+    B : torch.Tensor
+        The second input tensor. Usually the weight tensor.
+    out : torch.Tensor
+        The output tensor.
+    transposed_A : bool
+        Whether A is transposed
+    transposed_B : bool
+        Whether B is transposed
+    state : QuantState
+        Contains quantization info, such as blocksize and dtype
+
+    Returns
+    -------
+    torch.Tensor:
+        GEMM output tensor.
+    """
+    if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and hasattr(state, "op_context"):
+        assert state.op_context is not None
+        output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle())
+    else:
+        dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize)
+        output = torch.matmul(A, dqB)
+    if out is not None:
+        out.copy_(output)
+    else:
+        out = output
+    return out
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 7e9ab8d05..d52cb4847 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -285,7 +285,7 @@ def from_prequantized(
         return self
 
     def _quantize(self, device):
-        w = self.data.contiguous().cuda(device)
+        w = self.data.contiguous().to(device)
         w_4bit, quant_state = bnb.functional.quantize_4bit(
             w,
             blocksize=self.blocksize,
@@ -303,6 +303,9 @@ def _quantize(self, device):
     def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
         return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)
 
+    def cpu(self, non_blocking: bool = False):
+        return self.to(device="cpu", non_blocking=non_blocking)
+
     @overload
     def to(
         self: T,
@@ -320,7 +323,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if device is not None and device.type == "cuda" and not self.bnb_quantized:
+        if device is not None and device.type in ["cuda", "cpu"] and not self.bnb_quantized:
             return self._quantize(device)
         else:
             if self.quant_state is not None:

From 06f6b2513390c0c662af3e32d7185430bacdaa24 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 9 May 2024 21:29:59 +0000
Subject: [PATCH 141/234] skip linear no igemmlt test

---
 tests/test_linear8bitlt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index ca52f312e..2a4bd02e2 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -40,6 +40,7 @@ def test_layout_exact_match():
         assert torch.all(torch.eq(restored_x, x))
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 def test_linear_no_igemmlt():
     linear = torch.nn.Linear(1024, 3072)
     x = torch.randn(3, 1024, dtype=torch.half)

From 2359452dbc49d0422da7cecd3cbf7855ccf01da5 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 9 May 2024 21:30:31 +0000
Subject: [PATCH 142/234] Remove archive functional file

---
 bitsandbytes/archive_functional.py | 2466 ----------------------------
 1 file changed, 2466 deletions(-)
 delete mode 100644 bitsandbytes/archive_functional.py

diff --git a/bitsandbytes/archive_functional.py b/bitsandbytes/archive_functional.py
deleted file mode 100644
index dac7430ed..000000000
--- a/bitsandbytes/archive_functional.py
+++ /dev/null
@@ -1,2466 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-import ctypes as ct
-from functools import reduce  # Required in Python 3
-import itertools
-import operator
-from typing import Tuple
-
-import numpy as np
-from scipy.stats import norm
-import torch
-from torch import Tensor
-
-from .cextension import COMPILED_WITH_CUDA, lib
-
-
-# math.prod not compatible with python < 3.8
-def prod(iterable):
-    return reduce(operator.mul, iterable, 1)
-
-
-name2qmap = {}
-
-if COMPILED_WITH_CUDA:
-    """C FUNCTIONS FOR OPTIMIZERS"""
-    str2optimizer32bit = {}
-    str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16)  # , lib.cadam32bit_grad_bf16)
-    str2optimizer32bit["momentum"] = (
-        lib.cmomentum32bit_grad_32,
-        lib.cmomentum32bit_grad_16,
-    )
-    str2optimizer32bit["rmsprop"] = (
-        lib.crmsprop32bit_grad_32,
-        lib.crmsprop32bit_grad_16,
-    )
-    str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16)  # , lib.clion32bit_grad_bf16)
-    str2optimizer32bit["adagrad"] = (
-        lib.cadagrad32bit_grad_32,
-        lib.cadagrad32bit_grad_16,
-    )
-
-    str2optimizer8bit = {}
-    str2optimizer8bit["adam"] = (
-        lib.cadam_static_8bit_grad_32,
-        lib.cadam_static_8bit_grad_16,
-    )
-    str2optimizer8bit["momentum"] = (
-        lib.cmomentum_static_8bit_grad_32,
-        lib.cmomentum_static_8bit_grad_16,
-    )
-    str2optimizer8bit["rmsprop"] = (
-        lib.crmsprop_static_8bit_grad_32,
-        lib.crmsprop_static_8bit_grad_16,
-    )
-    str2optimizer8bit["lion"] = (
-        lib.clion_static_8bit_grad_32,
-        lib.clion_static_8bit_grad_16,
-    )
-    str2optimizer8bit["lamb"] = (
-        lib.cadam_static_8bit_grad_32,
-        lib.cadam_static_8bit_grad_16,
-    )
-    str2optimizer8bit["lars"] = (
-        lib.cmomentum_static_8bit_grad_32,
-        lib.cmomentum_static_8bit_grad_16,
-    )
-
-    str2optimizer8bit_blockwise = {}
-    str2optimizer8bit_blockwise["adam"] = (
-        lib.cadam_8bit_blockwise_grad_fp32,
-        lib.cadam_8bit_blockwise_grad_fp16,
-        # lib.cadam_8bit_blockwise_grad_bf16,
-    )
-    str2optimizer8bit_blockwise["momentum"] = (
-        lib.cmomentum_8bit_blockwise_grad_fp32,
-        lib.cmomentum_8bit_blockwise_grad_fp16,
-    )
-    str2optimizer8bit_blockwise["rmsprop"] = (
-        lib.crmsprop_8bit_blockwise_grad_fp32,
-        lib.crmsprop_8bit_blockwise_grad_fp16,
-    )
-    str2optimizer8bit_blockwise["lion"] = (
-        lib.clion_8bit_blockwise_grad_fp32,
-        lib.clion_8bit_blockwise_grad_fp16,
-        # lib.clion_8bit_blockwise_grad_bf16,
-    )
-    str2optimizer8bit_blockwise["adagrad"] = (
-        lib.cadagrad_8bit_blockwise_grad_fp32,
-        lib.cadagrad_8bit_blockwise_grad_fp16,
-    )
-
-
-class GlobalPageManager:
-    _instance = None
-
-    def __init__(self):
-        raise RuntimeError("Call get_instance() instead")
-
-    def initialize(self):
-        self.paged_tensors = []
-
-    @classmethod
-    def get_instance(cls):
-        if cls._instance is None:
-            cls._instance = cls.__new__(cls)
-            cls._instance.initialize()
-        return cls._instance
-
-    def prefetch_all(self, to_cpu=False):
-        # assume the first added, will be the
-        # ones that are used first, so swap them in last
-        # in the case they are evicted again
-        for t in self.paged_tensors[::-1]:
-            prefetch_tensor(t, to_cpu)
-
-
-class CUBLAS_Context:
-    _instance = None
-
-    def __init__(self):
-        raise RuntimeError("Call get_instance() instead")
-
-    def initialize(self):
-        self.context = {}
-
-    @classmethod
-    def get_instance(cls):
-        if cls._instance is None:
-            cls._instance = cls.__new__(cls)
-            cls._instance.initialize()
-        return cls._instance
-
-    def get_context(self, device):
-        if device.index not in self.context:
-            prev_device = torch.cuda.current_device()
-            torch.cuda.set_device(device)
-            self.context[device.index] = ct.c_void_p(lib.get_context())
-            torch.cuda.set_device(prev_device)
-        return self.context[device.index]
-
-
-class Cusparse_Context:
-    _instance = None
-
-    def __init__(self):
-        raise RuntimeError("Call get_instance() instead")
-
-    def initialize(self):
-        # self.context = ct.c_void_p(lib.get_cusparse())
-        if torch.version.cuda:
-            self.context = ct.c_void_p(lib.get_cusparse())
-        elif torch.version.hip:
-            self.context = ct.c_void_p(lib.get_hipsparse())
-
-    @classmethod
-    def get_instance(cls):
-        if cls._instance is None:
-            cls._instance = cls.__new__(cls)
-            cls._instance.initialize()
-        return cls._instance
-
-
-dtype2bytes = {}
-dtype2bytes[torch.float32] = 4
-dtype2bytes[torch.float16] = 2
-dtype2bytes[torch.bfloat16] = 2
-dtype2bytes[torch.uint8] = 1
-dtype2bytes[torch.int8] = 1
-
-
-def get_paged(*shape, dtype=torch.float32, device=torch.device("cuda", index=0)):
-    num_bytes = dtype2bytes[dtype] * prod(shape)
-    cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
-    c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int))
-    new_array = np.ctypeslib.as_array(c_ptr, shape=shape)
-    out = torch.frombuffer(new_array, dtype=dtype, count=prod(shape)).view(shape)
-    out.is_paged = True
-    out.page_deviceid = device.index
-    return out
-
-
-def prefetch_tensor(A, to_cpu=False):
-    assert A.is_paged, "Only paged tensors can be prefetched!"
-    if to_cpu:
-        deviceid = -1
-    else:
-        deviceid = A.page_deviceid
-
-    num_bytes = dtype2bytes[A.dtype] * A.numel()
-    lib.cprefetch(get_ptr(A), ct.c_size_t(num_bytes), ct.c_int32(deviceid))
-
-
-def elementwise_func(func_name, A, B, value, prefetch=True):
-    func = None
-    if A.dtype == torch.float32:
-        func = getattr(lib, f"c{func_name}_fp32", None)
-        cvalue = ct.c_float(value)
-    elif A.dtype == torch.uint8:
-        func = getattr(lib, f"c{func_name}_uint8", None)
-        cvalue = ct.c_uint8(value)
-
-    if func is None:
-        raise NotImplementedError(f"Function not implemented: {func_name}")
-
-    is_managed = getattr(A, "is_managed", False)
-    if is_managed and prefetch:
-        prefetch_tensor(A)
-        if B is not None:
-            prefetch_tensor(B)
-
-    func(get_ptr(A), get_ptr(B), cvalue, ct.c_int64(A.numel()))
-    if A.is_paged or B.is_paged:
-        # paged function are fully asynchronous
-        # if we return from this function, we want to the tensor
-        # to be in the correct state, that is the final state after the
-        # operation occurred. So we synchronize.
-        torch.cuda.synchronize()
-
-
-def fill(A, value, device=None, prefetch=True):
-    elementwise_func("fill", A, None, value)
-
-
-def arange(A, device=None):
-    elementwise_func("arange", A, None, 0)
-
-
-def _mul(A, B, device=None):
-    elementwise_func("_mul", A, B, 0)
-
-
-def create_linear_map(signed=True, total_bits=8, add_zero=True):
-    sign = -1.0 if signed else 0.0
-    total_values = 2**total_bits
-    if add_zero or total_bits < 8:
-        # add a zero
-        # since we simulate less bits by having zeros in the data type, we
-        # we need to center the quantization around zero and as such lose
-        # a single value
-        total_values = 2**total_bits if not signed else 2**total_bits - 1
-
-    values = torch.linspace(sign, 1.0, total_values)
-    gap = 256 - values.numel()
-    if gap == 0:
-        return values
-    else:
-        l = values.numel() // 2
-        return torch.Tensor(values[:l].tolist() + [0] * gap + values[l:].tolist())
-
-
-def create_normal_map(offset=0.9677083, use_extra_value=True):
-    if use_extra_value:
-        # one more positive value, this is an asymmetric type
-        v1 = norm.ppf(torch.linspace(offset, 0.5, 9)[:-1]).tolist()
-        v2 = [0] * (256 - 15)  ## we have 15 non-zero values in this data type
-        v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist()
-        v = v1 + v2 + v3
-    else:
-        v1 = norm.ppf(torch.linspace(offset, 0.5, 8)[:-1]).tolist()
-        v2 = [0] * (256 - 14)  ## we have 14 non-zero values in this data type
-        v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist()
-        v = v1 + v2 + v3
-
-    values = torch.Tensor(v)
-    values = values.sort().values
-    values /= values.max()
-    assert values.numel() == 256
-    return values
-
-
-def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8):
-    e = exponent_bits
-    p = precision_bits
-    has_sign = 1 if signed else 0
-    assert e + p == total_bits - has_sign
-    # the exponent is biased to 2^(e-1) -1 == 0
-    evalues = []
-    pvalues = []
-    for i, val in enumerate(range(-(2 ** (exponent_bits - has_sign)), 2 ** (exponent_bits - has_sign), 1)):
-        evalues.append(2**val)
-
-    values = []
-    lst = list(itertools.product([0, 1], repeat=precision_bits))
-    # for ev in evalues:
-    bias = 2 ** (exponent_bits - 1)
-    for evalue in range(2 ** (exponent_bits)):
-        for bit_pattern in lst:
-            value = 1 if evalue != 0 else 0
-            for i, pval in enumerate(list(bit_pattern)):
-                value += pval * (2 ** -(i + 1))
-            if evalue == 0:
-                # subnormals
-                value = value * 2**-(bias)
-            else:
-                # normals
-                value = value * 2 ** -(evalue - bias - 1)
-            values.append(value)
-            if signed:
-                values.append(-value)
-
-    assert len(values) == 2**total_bits
-    values.sort()
-    if total_bits < 8:
-        gap = 256 - len(values)
-        for i in range(gap):
-            values.append(0)
-    values.sort()
-    code = torch.Tensor(values)
-    code /= code.max()
-
-    return code
-
-
-def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
-    """
-    Creates the dynamic quantiztion map.
-
-    The dynamic data type is made up of a dynamic exponent and
-    fraction. As the exponent increase from 0 to -7 the number
-    of bits available for the fraction shrinks.
-
-    This is a generalization of the dynamic type where a certain
-    number of the bits and be reserved for the linear quantization
-    region (the fraction). n determines the maximum number of
-    exponent bits.
-
-    For more details see
-    (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561]
-    """
-
-    data = []
-    # these are additional items that come from the case
-    # where all the exponent bits are zero and no
-    # indicator bit is present
-    non_sign_bits = total_bits - (1 if signed else 0)
-    additional_items = 2 ** (non_sign_bits - max_exponent_bits) - 1
-    if not signed:
-        additional_items = 2 * additional_items
-    for i in range(max_exponent_bits):
-        fraction_items = int(
-            2 ** (i + non_sign_bits - max_exponent_bits) + 1
-            if signed
-            else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1
-        )
-        boundaries = torch.linspace(0.1, 1, fraction_items)
-        means = (boundaries[:-1] + boundaries[1:]) / 2.0
-        data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
-        if signed:
-            data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
-
-        if additional_items > 0:
-            boundaries = torch.linspace(0.1, 1, additional_items + 1)
-            means = (boundaries[:-1] + boundaries[1:]) / 2.0
-            data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
-            if signed:
-                data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist()
-
-    data.append(0)
-    data.append(1.0)
-
-    gap = 256 - len(data)
-    for i in range(gap):
-        data.append(0)
-
-    data.sort()
-    return Tensor(data)
-
-
-def create_quantile_map(A, total_bits=8):
-    q = estimate_quantiles(A, num_quantiles=2**total_bits - 1)
-    q = q.tolist()
-    q.append(0)
-
-    gap = 256 - len(q)
-    for i in range(gap):
-        q.append(0)
-
-    q.sort()
-
-    q = Tensor(q)
-    q = q / q.abs().max()
-    return q
-
-
-def get_special_format_str():
-    if not torch.cuda.is_available():
-        return "col_turing"
-    major, _minor = torch.cuda.get_device_capability()
-    if major <= 7:
-        return "col_turing"
-    if major == 8:
-        return "col_ampere"
-    return "col_turing"
-
-
-def is_on_gpu(tensors):
-    on_gpu = True
-    gpu_ids = set()
-    for t in tensors:
-        if t is None:
-            continue  # NULL pointers are fine
-        is_paged = getattr(t, "is_paged", False)
-        on_gpu &= t.device.type == "cuda" or is_paged
-        if not is_paged:
-            gpu_ids.add(t.device.index)
-    if not on_gpu:
-        raise TypeError(
-            f"All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}"
-        )
-    if len(gpu_ids) > 1:
-        raise TypeError(
-            f"Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}"
-        )
-    return on_gpu
-
-
-def get_ptr(A: Tensor) -> ct.c_void_p:
-    """
-    Get the ctypes pointer from a PyTorch Tensor.
-
-    Parameters
-    ----------
-    A : torch.tensor
-        The PyTorch tensor.
-
-    Returns
-    -------
-    ctypes.c_void_p
-    """
-    if A is None:
-        return None
-    else:
-        return ct.c_void_p(A.data.data_ptr())
-
-
-def pre_call(device):
-    prev_device = torch.cuda.current_device()
-    torch.cuda.set_device(device)
-    return prev_device
-
-
-def post_call(prev_device):
-    torch.cuda.set_device(prev_device)
-
-
-def get_transform_func(dtype, orderA, orderOut, transpose=False):
-    name = f'ctransform_{(8 if dtype == torch.int8 else 32)}_{orderA}_to_{orderOut}_{"t" if transpose else "n"}'
-    if not hasattr(lib, name):
-        print(name)
-        raise ValueError(
-            f"Transform function not supported: {orderA} to {orderOut} for data type {dtype} and transpose={transpose}"
-        )
-    else:
-        return getattr(lib, name)
-
-
-def get_transform_buffer(shape, dtype, device, to_order, from_order="row", transpose=False):
-    # init_func = torch.empty
-    init_func = torch.zeros
-    dims = len(shape)
-
-    if dims == 2:
-        rows = shape[0]
-    elif dims == 3:
-        rows = shape[0] * shape[1]
-    cols = shape[-1]
-
-    state = (shape, to_order)
-    if transpose:
-        # swap dims
-        tmp = rows
-        rows = cols
-        cols = tmp
-        state = (shape[::-1], to_order)
-
-    if to_order == "row" or to_order == "col":
-        return init_func(shape, dtype=dtype, device=device), state
-    elif to_order == "col32":
-        # blocks of 32 columns (padded)
-        cols = 32 * ((cols + 31) // 32)
-        return init_func((rows, cols), dtype=dtype, device=device), state
-    elif to_order == "col_turing":
-        # blocks of 32 columns and 8 rows
-        cols = 32 * ((cols + 31) // 32)
-        rows = 8 * ((rows + 7) // 8)
-        return init_func((rows, cols), dtype=dtype, device=device), state
-    elif to_order == "col_ampere":
-        # blocks of 32 columns and 32 rows
-        cols = 32 * ((cols + 31) // 32)
-        rows = 32 * ((rows + 31) // 32)
-        return init_func((rows, cols), dtype=dtype, device=device), state
-    else:
-        raise NotImplementedError(f"To_order not supported: {to_order}")
-
-
-def nvidia_transform(
-    A,
-    to_order,
-    from_order="row",
-    out=None,
-    transpose=False,
-    state=None,
-    ld=None,
-):
-    if state is None:
-        state = (A.shape, from_order)
-    else:
-        from_order = state[1]
-    if out is None:
-        out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1])
-    else:
-        new_state = (state[1], to_order)
-    func = get_transform_func(A.dtype, from_order, to_order, transpose)
-
-    shape = state[0]
-    if len(shape) == 2:
-        dim1 = ct.c_int32(shape[0])
-        dim2 = ct.c_int32(shape[1])
-    elif ld is not None:
-        n = prod(shape)
-        dim1 = prod([shape[i] for i in ld])
-        dim2 = ct.c_int32(n // dim1)
-        dim1 = ct.c_int32(dim1)
-    else:
-        dim1 = ct.c_int32(shape[0] * shape[1])
-        dim2 = ct.c_int32(shape[2])
-
-    ptr = CUBLAS_Context.get_instance().get_context(A.device)
-    func(ptr, get_ptr(A), get_ptr(out), dim1, dim2)
-
-    return out, new_state
-
-
-def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, num_quantiles=256) -> Tensor:
-    """
-    Estimates 256 equidistant quantiles on the input tensor eCDF.
-
-    Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles
-    via the eCDF of the input tensor `A`. This is a fast but approximate algorithm
-    and the extreme quantiles close to 0 and 1 have high variance / large estimation
-    errors. These large errors can be avoided by using the offset variable which trims
-    the distribution. The default offset value of 1/512 ensures minimum entropy encoding -- it
-    trims 1/512 = 0.2% from each side of the distrivution. An offset value of 0.01 to 0.02
-    usually has a much lower error but is not a minimum entropy encoding. Given an offset
-    of 0.02 equidistance points in the range [0.02, 0.98] are used for the quantiles.
-
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input tensor. Any shape.
-    out : torch.Tensor
-        Tensor with the 256 estimated quantiles.
-    offset : float
-        The offset for the first and last quantile from 0 and 1. Default: 1/(2*num_quantiles)
-    num_quantiles : int
-        The number of equally spaced quantiles.
-
-    Returns
-    -------
-    torch.Tensor:
-        The 256 quantiles in float32 datatype.
-    """
-    if A.numel() < 256:
-        raise NotImplementedError(
-            f"Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values."
-        )
-    if num_quantiles > 256:
-        raise NotImplementedError(
-            f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}"
-        )
-    if num_quantiles < 256 and offset == 1 / (512):
-        # override default arguments
-        offset = 1 / (2 * num_quantiles)
-
-    if out is None:
-        out = torch.zeros((256,), dtype=torch.float32, device=A.device)
-    is_on_gpu([A, out])
-    device = pre_call(A.device)
-    if A.dtype == torch.float32:
-        lib.cestimate_quantiles_fp32(get_ptr(A), get_ptr(out), ct.c_float(offset), ct.c_int(A.numel()))
-    elif A.dtype == torch.float16:
-        lib.cestimate_quantiles_fp16(get_ptr(A), get_ptr(out), ct.c_float(offset), ct.c_int(A.numel()))
-    else:
-        raise NotImplementedError(f"Not supported data type {A.dtype}")
-    post_call(device)
-
-    if num_quantiles < 256:
-        step = round(256 / num_quantiles)
-        idx = torch.linspace(0, 255, num_quantiles).long().to(A.device)
-        out = out[idx]
-
-    return out
-
-
-def quantize_blockwise(
-    A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False
-) -> Tensor:
-    """
-    Quantize tensor A in blocks of size 4096 values.
-
-    Quantizes tensor A by dividing it into blocks of 4096 values.
-    Then the absolute maximum value within these blocks is calculated
-    for the non-linear quantization.
-
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input tensor.
-    code : torch.Tensor
-        The quantization map.
-    absmax : torch.Tensor
-        The absmax values.
-    out : torch.Tensor
-        The output tensor (8-bit).
-
-    Returns
-    -------
-    torch.Tensor:
-        The 8-bit tensor.
-    tuple(torch.Tensor, torch.Tensor):
-        The quantization state to undo the quantization.
-    """
-
-    if code is None:
-        if "dynamic" not in name2qmap:
-            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
-        code = name2qmap["dynamic"]
-
-    if absmax is None:
-        n = A.numel()
-        blocks = n // blocksize
-        blocks += 1 if n % blocksize > 0 else 0
-        absmax = torch.zeros((blocks,), device=A.device)
-
-    if out is None:
-        out = torch.zeros_like(A, dtype=torch.uint8)
-
-    if A.device.type != "cpu":
-        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
-        cblocksize = ct.c_int32(blocksize)
-        prev_device = pre_call(A.device)
-        code = code.to(A.device)
-        is_on_gpu([code, A, out, absmax])
-        if A.dtype == torch.float32:
-            lib.cquantize_blockwise_fp32(
-                get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel())
-            )
-        elif A.dtype == torch.float16:
-            lib.cquantize_blockwise_fp16(
-                get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel())
-            )
-        else:
-            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-        post_call(A.device)
-    else:
-        # cpu
-        code = code.cpu()
-        lib.cquantize_blockwise_cpu_fp32(
-            get_ptr(code),
-            get_ptr(A),
-            get_ptr(absmax),
-            get_ptr(out),
-            ct.c_longlong(blocksize),
-            ct.c_longlong(A.numel()),
-        )
-
-    if nested:
-        offset = absmax.mean()
-        absmax -= offset
-        qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False)
-        state = [qabsmax, code, blocksize, nested, offset, state2]
-    else:
-        state = [absmax, code, blocksize, nested, None, None]
-
-    return out, state
-
-
-def dequantize_blockwise(
-    A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
-    absmax: Tensor = None,
-    code: Tensor = None,
-    out: Tensor = None,
-    blocksize: int = 4096,
-    nested=False,
-) -> Tensor:
-    """
-    Dequantizes blockwise quantized values.
-
-    Dequantizes the tensor A with maximum absolute values absmax in
-    blocks of size 4096.
-
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input 8-bit tensor.
-    quant_state : tuple(torch.Tensor, torch.Tensor)
-        Tuple of code and absmax values.
-    absmax : torch.Tensor
-        The absmax values.
-    code : torch.Tensor
-        The quantization map.
-    out : torch.Tensor
-        Dequantized output tensor (default: float32)
-
-
-    Returns
-    -------
-    torch.Tensor:
-        Dequantized tensor (default: float32)
-    """
-    assert quant_state is not None or absmax is not None
-    if code is None and quant_state is None:
-        if "dynamic" not in name2qmap:
-            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
-        code = name2qmap["dynamic"]
-
-    if out is None:
-        out = torch.zeros_like(A, dtype=torch.float32)
-
-    if quant_state is None:
-        quant_state = (absmax, code, blocksize)
-        assert absmax is not None and out is not None
-    else:
-        absmax, code, blocksize, nested, offset, state2 = quant_state
-        if nested:
-            absmax = dequantize_blockwise(absmax, state2)
-            absmax += offset
-
-    if A.device.type != "cpu":
-        device = pre_call(A.device)
-        code = code.to(A.device)
-        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
-            raise ValueError(
-                f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]"
-            )
-        is_on_gpu([A, absmax, out])
-        if out.dtype == torch.float32:
-            lib.cdequantize_blockwise_fp32(
-                get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())
-            )
-        elif out.dtype == torch.float16:
-            lib.cdequantize_blockwise_fp16(
-                get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())
-            )
-        else:
-            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-        post_call(A.device)
-    else:
-        code = code.cpu()
-        lib.cdequantize_blockwise_cpu_fp32(
-            get_ptr(quant_state[1]),
-            get_ptr(A),
-            get_ptr(quant_state[0]),
-            get_ptr(out),
-            ct.c_longlong(blocksize),
-            ct.c_longlong(A.numel()),
-        )
-
-    return out
-
-
-def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
-    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4")
-
-
-def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
-    return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4")
-
-
-def quantize_4bit(
-    A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type="fp4"
-) -> Tensor:
-    """
-    Quantize tensor A in blocks of 4-bit values.
-
-    Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
-
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input tensor.
-    absmax : torch.Tensor
-        The absmax values.
-    out : torch.Tensor
-        The output tensor (8-bit).
-    blocksize : int
-        The blocksize used in quantization.
-    quant_type : str
-        The 4-bit quantization data type {fp4, nf4}
-
-    Returns
-    -------
-    torch.Tensor:
-        The 8-bit tensor with packed 4-bit values.
-    tuple(torch.Tensor, torch.Size, torch.dtype, int):
-        The quantization state to undo the quantization.
-    """
-    if A.device.type != "cuda":
-        raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}")
-    if quant_type not in ["fp4", "nf4"]:
-        raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
-
-    n = A.numel()
-    input_shape = A.shape
-
-    if absmax is None:
-        blocks = n // blocksize
-        blocks += 1 if n % blocksize > 0 else 0
-        absmax = torch.zeros((blocks,), device=A.device)
-
-    if out is None:
-        out = torch.zeros(((n + 1) // 2, 1), dtype=torch.uint8, device=A.device)
-
-    assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
-
-    prev_device = pre_call(A.device)
-    is_on_gpu([A, out, absmax])
-
-    if A.dtype == torch.float32:
-        if quant_type == "fp4":
-            lib.cquantize_blockwise_fp32_fp4(
-                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
-            )
-        else:
-            lib.cquantize_blockwise_fp32_nf4(
-                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
-            )
-    elif A.dtype == torch.float16:
-        if quant_type == "fp4":
-            lib.cquantize_blockwise_fp16_fp4(
-                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
-            )
-        else:
-            lib.cquantize_blockwise_fp16_nf4(
-                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)
-            )
-    else:
-        raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-    post_call(A.device)
-
-    if compress_statistics:
-        offset = absmax.mean()
-        absmax -= offset
-        # code = create_custom_map().to(absmax.device)
-        # qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256)
-        qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
-        del absmax
-        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
-    else:
-        state = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
-
-    return out, state
-
-
-def dequantize_fp4(
-    A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
-    absmax: Tensor = None,
-    out: Tensor = None,
-    blocksize: int = 64,
-) -> Tensor:
-    return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
-
-
-def dequantize_nf4(
-    A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
-    absmax: Tensor = None,
-    out: Tensor = None,
-    blocksize: int = 64,
-) -> Tensor:
-    return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
-
-
-def dequantize_4bit(
-    A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
-    absmax: Tensor = None,
-    out: Tensor = None,
-    blocksize: int = 64,
-    quant_type="fp4",
-) -> Tensor:
-    """
-    Dequantizes FP4 blockwise quantized values.
-
-    Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
-
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input 8-bit tensor (packed 4-bit values).
-    quant_state : tuple(torch.Tensor, torch.Size, torch.dtype)
-        Tuple of absmax values, original tensor shape and original dtype.
-    absmax : torch.Tensor
-        The absmax values.
-    out : torch.Tensor
-        Dequantized output tensor.
-    blocksize : int
-        The blocksize used in quantization.
-    quant_type : str
-        The 4-bit quantization data type {fp4, nf4}
-
-
-    Returns
-    -------
-    torch.Tensor:
-        Dequantized tensor.
-    """
-    if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
-        raise ValueError(
-            f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]"
-        )
-    if quant_type not in ["fp4", "nf4"]:
-        raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
-
-    if quant_state is None:
-        assert absmax is not None and out is not None
-        shape = out.shape
-        dtype = out.dtype
-    else:
-        absmax, shape, dtype, blocksize, compressed_stats, quant_type = quant_state
-
-    if compressed_stats is not None:
-        offset, state2 = compressed_stats
-        absmax = dequantize_blockwise(absmax, state2)
-        absmax += offset
-
-    if out is None:
-        out = torch.empty(shape, dtype=dtype, device=A.device)
-
-    n = out.numel()
-
-    device = pre_call(A.device)
-    is_on_gpu([A, absmax, out])
-    if out.dtype == torch.float32:
-        if quant_type == "fp4":
-            lib.cdequantize_blockwise_fp32_fp4(
-                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)
-            )
-        else:
-            lib.cdequantize_blockwise_fp32_nf4(
-                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)
-            )
-    elif out.dtype == torch.float16:
-        if quant_type == "fp4":
-            lib.cdequantize_blockwise_fp16_fp4(
-                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)
-            )
-        else:
-            lib.cdequantize_blockwise_fp16_nf4(
-                get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)
-            )
-    else:
-        raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-    post_call(A.device)
-
-    is_transposed = True if A.shape[0] == 1 else False
-    if is_transposed:
-        return out.t()
-    else:
-        return out
-
-
-def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
-    if code is None:
-        if "dynamic" not in name2qmap:
-            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
-        code = name2qmap["dynamic"]
-        code = code.to(A.device)
-
-    absmax = torch.abs(A).max()
-    inp = A / absmax
-    out = quantize_no_absmax(inp, code, out)
-    return out, (absmax, code)
-
-
-def dequantize(
-    A: Tensor,
-    quant_state: Tuple[Tensor, Tensor] = None,
-    absmax: Tensor = None,
-    code: Tensor = None,
-    out: Tensor = None,
-) -> Tensor:
-    assert quant_state is not None or absmax is not None
-    if code is None and quant_state is None:
-        if "dynamic" not in name2qmap:
-            name2qmap["dynamic"] = create_dynamic_map().to(A.device)
-        code = name2qmap["dynamic"]
-        code = code.to(A.device)
-
-    if quant_state is None:
-        quant_state = (absmax, code)
-    out = dequantize_no_absmax(A, quant_state[1], out)
-    return out * quant_state[0]
-
-
-def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
-    """
-    Quantizes input tensor to 8-bit.
-
-    Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
-    `out` using the quantization map `code`.
-
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input tensor.
-    code : torch.Tensor
-        The quantization map.
-    out : torch.Tensor, optional
-        The output tensor. Needs to be of type byte.
-
-    Returns
-    -------
-    torch.Tensor:
-        Quantized 8-bit tensor.
-    """
-    prev_device = pre_call(A.device)
-    if out is None:
-        out = torch.zeros_like(A, dtype=torch.uint8)
-    is_on_gpu([A, out])
-    lib.cquantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()))
-    post_call(prev_device)
-    return out
-
-
-def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
-    """
-    Dequantizes the 8-bit tensor to 32-bit.
-
-    Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
-    the quantization map `code`.
-
-    Parameters
-    ----------
-    A : torch.Tensor
-        The 8-bit input tensor.
-    code : torch.Tensor
-        The quantization map.
-    out : torch.Tensor
-        The 32-bit output tensor.
-
-    Returns
-    -------
-    torch.Tensor:
-        32-bit output tensor.
-    """
-    prev_device = pre_call(A.device)
-    if out is None:
-        out = torch.zeros_like(A, dtype=torch.float32)
-    is_on_gpu([code, A, out])
-    lib.cdequantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()))
-    post_call(prev_device)
-    return out
-
-
-def optimizer_update_32bit(
-    optimizer_name: str,
-    g: Tensor,
-    p: Tensor,
-    state1: Tensor,
-    beta1: float,
-    eps: float,
-    step: int,
-    lr: float,
-    state2: Tensor = None,
-    beta2: float = 0.0,
-    weight_decay: float = 0.0,
-    gnorm_scale: float = 1.0,
-    unorm_vec: Tensor = None,
-    max_unorm: float = 0.0,
-    skip_zeros=False,
-) -> None:
-    """
-    Performs an inplace optimizer update with one or two optimizer states.
-
-    Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.
-
-    Parameters
-    ----------
-    optimizer_name : str
-        The name of the optimizer: {adam}.
-    g : torch.Tensor
-        Gradient tensor.
-    p : torch.Tensor
-        Parameter tensor.
-    state1 : torch.Tensor
-        Optimizer state 1.
-    beta1 : float
-        Optimizer beta1.
-    eps : float
-        Optimizer epsilon.
-    weight_decay : float
-        Weight decay.
-    step : int
-        Current optimizer step.
-    lr : float
-        The learning rate.
-    state2 : torch.Tensor
-        Optimizer state 2.
-    beta2 : float
-        Optimizer beta2.
-    gnorm_scale : float
-        The factor to rescale the gradient to the max clip value.
-    unorm_vec : torch.Tensor
-        The tensor for the update norm.
-    max_unorm : float
-        The maximum update norm relative to the weight norm.
-    skip_zeros : bool
-        Whether to skip zero-valued gradients or not (default: False).
-    """
-
-    param_norm = 0.0
-    if max_unorm > 0.0:
-        param_norm = torch.norm(p.data.float())
-
-    optim_func = None
-    if g.dtype == torch.float32:
-        optim_func = str2optimizer32bit[optimizer_name][0]
-    elif g.dtype == torch.float16:
-        optim_func = str2optimizer32bit[optimizer_name][1]
-    elif g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name]) == 3:
-        optim_func = str2optimizer32bit[optimizer_name][2]
-    else:
-        raise ValueError(
-            f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}"
-        )
-
-    is_on_gpu([g, p, state1, state2, unorm_vec])
-    prev_device = pre_call(g.device)
-    optim_func(
-        get_ptr(g),
-        get_ptr(p),
-        get_ptr(state1),
-        get_ptr(state2),
-        get_ptr(unorm_vec),
-        ct.c_float(max_unorm),
-        ct.c_float(param_norm),
-        ct.c_float(beta1),
-        ct.c_float(beta2),
-        ct.c_float(eps),
-        ct.c_float(weight_decay),
-        ct.c_int32(step),
-        ct.c_float(lr),
-        ct.c_float(gnorm_scale),
-        ct.c_bool(skip_zeros),
-        ct.c_int32(g.numel()),
-    )
-    post_call(prev_device)
-
-
-def optimizer_update_8bit(
-    optimizer_name: str,
-    g: Tensor,
-    p: Tensor,
-    state1: Tensor,
-    state2: Tensor,
-    beta1: float,
-    beta2: float,
-    eps: float,
-    step: int,
-    lr: float,
-    qmap1: Tensor,
-    qmap2: Tensor,
-    max1: Tensor,
-    max2: Tensor,
-    new_max1: Tensor,
-    new_max2: Tensor,
-    weight_decay: float = 0.0,
-    gnorm_scale: float = 1.0,
-    unorm_vec: Tensor = None,
-    max_unorm: float = 0.0,
-) -> None:
-    """
-    Performs an inplace Adam update.
-
-    Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights.
-    Uses AdamW formulation if weight decay > 0.0.
-
-    Parameters
-    ----------
-    optimizer_name : str
-        The name of the optimizer. Choices {adam, momentum}
-    g : torch.Tensor
-        Gradient tensor.
-    p : torch.Tensor
-        Parameter tensor.
-    state1 : torch.Tensor
-        Adam state 1.
-    state2 : torch.Tensor
-        Adam state 2.
-    beta1 : float
-        Adam beta1.
-    beta2 : float
-        Adam beta2.
-    eps : float
-        Adam epsilon.
-    weight_decay : float
-        Weight decay.
-    step : int
-        Current optimizer step.
-    lr : float
-        The learning rate.
-    qmap1 : torch.Tensor
-        Quantization map for first Adam state.
-    qmap2 : torch.Tensor
-        Quantization map for second Adam state.
-    max1 : torch.Tensor
-        Max value for first Adam state update.
-    max2 : torch.Tensor
-        Max value for second Adam state update.
-    new_max1 : torch.Tensor
-        Max value for the next Adam update of the first state.
-    new_max2 : torch.Tensor
-        Max value for the next Adam update of the second state.
-    gnorm_scale : float
-        The factor to rescale the gradient to the max clip value.
-    unorm_vec : torch.Tensor
-        The tensor for the update norm.
-    max_unorm : float
-        The maximum update norm relative to the weight norm.
-    """
-
-    param_norm = 0.0
-    if max_unorm > 0.0:
-        param_norm = torch.norm(p.data.float())
-
-    prev_device = pre_call(g.device)
-    is_on_gpu([g, p, state1, state2, unorm_vec, qmap1, qmap2, max1, max2, new_max1, new_max2])
-    if g.dtype == torch.float32 and state1.dtype == torch.uint8:
-        str2optimizer8bit[optimizer_name][0](
-            get_ptr(p),
-            get_ptr(g),
-            get_ptr(state1),
-            get_ptr(state2),
-            get_ptr(unorm_vec),
-            ct.c_float(max_unorm),
-            ct.c_float(param_norm),
-            ct.c_float(beta1),
-            ct.c_float(beta2),
-            ct.c_float(eps),
-            ct.c_int32(step),
-            ct.c_float(lr),
-            get_ptr(qmap1),
-            get_ptr(qmap2),
-            get_ptr(max1),
-            get_ptr(max2),
-            get_ptr(new_max1),
-            get_ptr(new_max2),
-            ct.c_float(weight_decay),
-            ct.c_float(gnorm_scale),
-            ct.c_int32(g.numel()),
-        )
-    elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
-        str2optimizer8bit[optimizer_name][1](
-            get_ptr(p),
-            get_ptr(g),
-            get_ptr(state1),
-            get_ptr(state2),
-            get_ptr(unorm_vec),
-            ct.c_float(max_unorm),
-            ct.c_float(param_norm),
-            ct.c_float(beta1),
-            ct.c_float(beta2),
-            ct.c_float(eps),
-            ct.c_int32(step),
-            ct.c_float(lr),
-            get_ptr(qmap1),
-            get_ptr(qmap2),
-            get_ptr(max1),
-            get_ptr(max2),
-            get_ptr(new_max1),
-            get_ptr(new_max2),
-            ct.c_float(weight_decay),
-            ct.c_float(gnorm_scale),
-            ct.c_int32(g.numel()),
-        )
-    else:
-        raise ValueError(
-            f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}"
-        )
-    post_call(prev_device)
-
-
-def optimizer_update_8bit_blockwise(
-    optimizer_name: str,
-    g: Tensor,
-    p: Tensor,
-    state1: Tensor,
-    state2: Tensor,
-    beta1: float,
-    beta2: float,
-    eps: float,
-    step: int,
-    lr: float,
-    qmap1: Tensor,
-    qmap2: Tensor,
-    absmax1: Tensor,
-    absmax2: Tensor,
-    weight_decay: float = 0.0,
-    gnorm_scale: float = 1.0,
-    skip_zeros=False,
-) -> None:
-    optim_func = None
-    prev_device = pre_call(g.device)
-    is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2])
-    if g.dtype == torch.float32 and state1.dtype == torch.uint8:
-        optim_func = str2optimizer8bit_blockwise[optimizer_name][0]
-    elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
-        optim_func = str2optimizer8bit_blockwise[optimizer_name][1]
-    elif (
-        g.dtype == torch.bfloat16
-        and state1.dtype == torch.uint8
-        and len(str2optimizer8bit_blockwise[optimizer_name]) == 3
-    ):
-        optim_func = str2optimizer8bit_blockwise[optimizer_name][2]
-    else:
-        raise ValueError(
-            f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}"
-        )
-    post_call(prev_device)
-
-    is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2])
-
-    prev_device = pre_call(g.device)
-    optim_func(
-        get_ptr(p),
-        get_ptr(g),
-        get_ptr(state1),
-        get_ptr(state2),
-        ct.c_float(beta1),
-        ct.c_float(beta2),
-        ct.c_float(eps),
-        ct.c_int32(step),
-        ct.c_float(lr),
-        get_ptr(qmap1),
-        get_ptr(qmap2),
-        get_ptr(absmax1),
-        get_ptr(absmax2),
-        ct.c_float(weight_decay),
-        ct.c_float(gnorm_scale),
-        ct.c_bool(skip_zeros),
-        ct.c_int32(g.numel()),
-    )
-    post_call(prev_device)
-
-
-def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5):
-    """Applies percentile clipping
-
-    grad: torch.Tensor
-        The gradient tensor.
-    gnorm_vec: torch.Tensor
-        Vector of gradient norms. 100 elements expected.
-    step: int
-        The current optimiation steps (number of past gradient norms).
-
-    """
-    prev_device = pre_call(grad.device)
-    is_on_gpu([grad, gnorm_vec])
-    if grad.dtype == torch.float32:
-        lib.cpercentile_clipping_g32(
-            get_ptr(grad),
-            get_ptr(gnorm_vec),
-            ct.c_int32(step),
-            ct.c_int32(grad.numel()),
-        )
-    elif grad.dtype == torch.float16:
-        lib.cpercentile_clipping_g16(
-            get_ptr(grad),
-            get_ptr(gnorm_vec),
-            ct.c_int32(step),
-            ct.c_int32(grad.numel()),
-        )
-    else:
-        raise ValueError(f"Gradient type {grad.dtype} not supported!")
-    post_call(prev_device)
-
-    current_gnorm = torch.sqrt(gnorm_vec[step % 100])
-    vals, idx = torch.sort(gnorm_vec)
-    clip_value = torch.sqrt(vals[percentile])
-    gnorm_scale = 1.0
-
-    if current_gnorm > clip_value:
-        gnorm_scale = clip_value / current_gnorm
-
-    return current_gnorm, clip_value, gnorm_scale
-
-
-def histogram_scatter_add_2d(histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor):
-    assert len(histogram.shape) == 2
-    assert histogram.dtype == torch.float32
-    assert source.dtype == torch.float32
-    assert index1.dtype == torch.int32
-    assert index2.dtype == torch.int32
-
-    assert histogram.device.type == "cuda"
-    assert index1.device.type == "cuda"
-    assert index2.device.type == "cuda"
-    assert source.device.type == "cuda"
-
-    maxdim1 = ct.c_int32(histogram.shape[0])
-    n = ct.c_int32(index1.numel())
-    is_on_gpu([histogram, index1, index2, source])
-    lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n)
-
-
-def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8):
-    if not torch.cuda.is_initialized():
-        torch.cuda.init()
-    if A.dtype != expected_type or B.dtype != expected_type:
-        raise TypeError(f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}")
-
-    sA = A.shape
-    sB = B.shape
-    tA = transposed_A
-    tB = transposed_B
-
-    correct = True
-
-    if len(sA) == 2 and len(sB) == 2:
-        if not tA and not tB and A.shape[1] != B.shape[0]:
-            correct = False
-        elif tA and not tB and A.shape[0] != B.shape[0]:
-            correct = False
-        elif tA and tB and A.shape[0] != B.shape[1]:
-            correct = False
-        elif not tA and tB and A.shape[1] != B.shape[1]:
-            correct = False
-    elif len(sA) == 3 and len(sB) == 2:
-        if not tA and not tB and A.shape[2] != B.shape[0]:
-            correct = False
-        elif tA and not tB and A.shape[1] != B.shape[0]:
-            correct = False
-        elif tA and tB and A.shape[1] != B.shape[1]:
-            correct = False
-        elif not tA and tB and A.shape[2] != B.shape[1]:
-            correct = False
-    elif len(sA) == 3 and len(sB) == 3:
-        if not tA and not tB and A.shape[2] != B.shape[1]:
-            correct = False
-        elif tA and not tB and A.shape[1] != B.shape[1]:
-            correct = False
-        elif tA and tB and A.shape[1] != B.shape[2]:
-            correct = False
-        elif not tA and tB and A.shape[2] != B.shape[2]:
-            correct = False
-
-    if out is not None:
-        sout = out.shape
-        # special case common in backprop
-        if not correct and len(sA) == 3 and len(sB) == 3:
-            if sout[0] == sA[2] and sout[1] == sB[2] and sA[0] == sB[0] and sA[1] == sB[1]:
-                correct = True
-    else:
-        if len(sA) == 2 and len(sB) == 2:
-            if not tA and not tB:
-                sout = (sA[0], sB[1])
-            elif tA and tB:
-                sout = (sA[1], sB[0])
-            elif tA and not tB:
-                sout = (sA[1], sB[1])
-            elif not tA and tB:
-                sout = (sA[0], sB[0])
-        elif len(sA) == 3 and len(sB) == 2:
-            if not tA and not tB:
-                sout = (sA[0], sA[1], sB[1])
-            elif tA and tB:
-                sout = (sA[0], sA[2], sB[0])
-            elif tA and not tB:
-                sout = (sA[0], sA[2], sB[1])
-            elif not tA and tB:
-                sout = (sA[0], sA[1], sB[0])
-        elif len(sA) == 3 and len(sB) == 3:
-            if not tA and not tB:
-                sout = (sA[0], sA[1], sB[2])
-            elif tA and tB:
-                sout = (sA[0], sA[2], sB[1])
-            elif tA and not tB:
-                sout = (sA[0], sA[2], sB[2])
-            elif not tA and tB:
-                sout = (sA[0], sA[1], sB[1])
-
-    if not correct:
-        raise ValueError(
-            f"Tensor dimensions incorrect for matrix mulitiplication: A x B: {sA} x {sB} with transpose for A x B: {tA} x {tB}."
-        )
-
-    return sout
-
-
-def cutlass3_gemm(A: Tensor, B: Tensor, out: Tensor = None, transposed_A=False, transposed_B=False, state=None):
-    # sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype)
-    if state is None:
-        Bshape = B.shape
-        bout = Bshape[1]
-    else:
-        Bshape = state[1]
-        bout = Bshape[0]
-    if out is None:
-        out = torch.zeros(size=(A.shape[0], bout), dtype=A.dtype, device=A.device)
-
-    sA = A.shape
-    sB = B.shape
-    if transposed_A and len(sA) == 2:
-        sA = (sA[1], sA[0])
-    elif transposed_A and len(sA) == 3:
-        sA = (sA[0], sA[2], sA[0])
-    if transposed_B and len(sB) == 2:
-        sB = (sB[1], sB[0])
-    elif transposed_B and len(sB) == 3:
-        sB = (sB[0], sB[2], sB[0])
-    # this is a mess: cuBLAS expect column major, but PyTorch is row major.
-    # So to perform the matrix multiplication, we have to treat A, B, and C matrices
-    # (transpose of row major is column major)
-    # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these
-
-    # matrices in the input arguments for cuBLAS
-    # column major: A @ B = C: [m, k] @ [k, n] = [m, n]
-    # row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n]
-    # column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m]
-    if len(sB) == 2:
-        if B.stride()[0] == B.shape[1]:
-            transposed_B = False
-        elif B.stride()[1] == B.shape[0]:
-            transposed_B = True
-        if len(A.shape) == 2:
-            if A.stride()[0] == A.shape[1]:
-                transposed_A = False
-            elif A.stride()[1] == A.shape[0]:
-                transposed_A = True
-        else:
-            if A.stride()[1] == A.shape[2]:
-                transposed_A = False
-            elif A.stride()[2] == A.shape[1]:
-                transposed_A = True
-
-        if len(sA) == 2:
-            n = sA[0]
-            ldb = A.stride()[1 if transposed_A else 0]
-        elif len(sA) == 3 and len(sB) == 2:
-            n = sA[0] * sA[1]
-            ldb = sA[2]
-
-        m = sB[1]
-        k = sB[0]
-        lda = B.stride()[0]
-        ldc = sB[1]
-    elif len(sB) == 3:
-        # special case
-        assert len(sA) == 3
-        if not (sA[0] == sB[0] and sA[1] == sB[1]):
-            raise ValueError(
-                f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}"
-            )
-
-        transposed_A = True
-        transposed_B = False
-
-        m = sB[2]
-        n = sA[2]
-        k = sB[0] * sB[1]
-
-        lda = n
-        ldb = sA[2]
-        ldc = m
-
-    ptr = CUBLAS_Context.get_instance().get_context(A.device)
-
-    # B^T @ A^T = C^T
-    # [km, nk -> mn]
-    # lda = ldb = ldc = 1
-    # lda = 1
-    if state is not None:
-        m = Bshape[0]
-        k = Bshape[1]
-        lda = Bshape[0]
-        ldc = Bshape[0]
-        ldb = (ldb + 1) // 2
-    # print(m, n, k, lda, ldb, ldc)
-    is_on_gpu([B, A, out])
-    m = ct.c_int32(m)
-    n = ct.c_int32(n)
-    k = ct.c_int32(k)
-    lda = ct.c_int32(lda)
-    ldb = ct.c_int32(ldb)
-    ldc = ct.c_int32(ldc)
-
-    if B.dtype == torch.uint8:
-        lib.cgemm_4bit_inference(
-            m, n, k, get_ptr(A), get_ptr(B), get_ptr(state[0]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3])
-        )
-    elif A.dtype == torch.float32:
-        lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc)
-    elif A.dtype == torch.float16:
-        lib.cgemm_host_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc)
-    else:
-        raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}")
-
-    return out
-
-
-def igemm(
-    A: Tensor,
-    B: Tensor,
-    out: Tensor = None,
-    transposed_A=False,
-    transposed_B=False,
-):
-    sout = check_matmul(A, B, out, transposed_A, transposed_B)
-    if out is None:
-        out = torch.zeros(size=sout, dtype=torch.int32, device=A.device)
-    if len(A.shape) == 3 and len(B.shape) == 3:
-        if A.shape[0] == B.shape[0] and A.shape[2] == B.shape[1]:
-            return batched_igemm(A, B, out)
-
-    sA = A.shape
-    sB = B.shape
-    if transposed_A and len(sA) == 2:
-        sA = (sA[1], sA[0])
-    elif transposed_A and len(sA) == 3:
-        sA = (sA[0], sA[2], sA[0])
-    if transposed_B and len(sB) == 2:
-        sB = (sB[1], sB[0])
-    elif transposed_B and len(sB) == 3:
-        sB = (sB[0], sB[2], sB[0])
-    # this is a mess: cuBLAS expect column major, but PyTorch is row major.
-    # So to perform the matrix multiplication, we have to treat A, B, and C matrices
-    # (transpose of row major is column major)
-    # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these
-
-    # matrices in the input arguments for cuBLAS
-    # column major: A @ B = C: [m, k] @ [k, n] = [m, n]
-    # row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n]
-    # column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m]
-    if len(sB) == 2:
-        if B.stride()[0] == B.shape[1]:
-            transposed_B = False
-        elif B.stride()[1] == B.shape[0]:
-            transposed_B = True
-        if len(A.shape) == 2:
-            if A.stride()[0] == A.shape[1]:
-                transposed_A = False
-            elif A.stride()[1] == A.shape[0]:
-                transposed_A = True
-        else:
-            if A.stride()[1] == A.shape[2]:
-                transposed_A = False
-            elif A.stride()[2] == A.shape[1]:
-                transposed_A = True
-
-        if len(sA) == 2:
-            n = sA[0]
-            ldb = A.stride()[1 if transposed_A else 0]
-        elif len(sA) == 3 and len(sB) == 2:
-            n = sA[0] * sA[1]
-            ldb = sA[2]
-
-        m = sB[1]
-        k = sB[0]
-        lda = B.stride()[(1 if transposed_B else 0)]
-        ldc = sB[1]
-    elif len(sB) == 3:
-        # special case
-        assert len(sA) == 3
-        if not (sA[0] == sB[0] and sA[1] == sB[1]):
-            raise ValueError(
-                f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}"
-            )
-
-        transposed_A = True
-        transposed_B = False
-
-        m = sB[2]
-        n = sA[2]
-        k = sB[0] * sB[1]
-
-        lda = m
-        ldb = sA[2]
-        ldc = m
-
-    ptr = CUBLAS_Context.get_instance().get_context(A.device)
-
-    # B^T @ A^T = C^T
-    # [km, nk -> mn]
-    is_on_gpu([B, A, out])
-    lib.cigemm(
-        ptr,
-        ct.c_bool(transposed_B),
-        ct.c_bool(transposed_A),
-        ct.c_int32(m),
-        ct.c_int32(n),
-        ct.c_int32(k),
-        get_ptr(B),
-        get_ptr(A),
-        get_ptr(out),
-        ct.c_int32(lda),
-        ct.c_int32(ldb),
-        ct.c_int32(ldc),
-    )
-    return out
-
-
-def batched_igemm(
-    A: Tensor,
-    B: Tensor,
-    out: Tensor = None,
-    transposed_A=False,
-    transposed_B=False,
-):
-    if not len(A.shape) == 3 or not len(B.shape) == 3:
-        raise ValueError(f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}")
-    sout = check_matmul(A, B, out, transposed_A, transposed_B)
-    if out is None:
-        out = torch.zeros(size=sout, dtype=torch.int32, device=A.device)
-
-    if B.is_contiguous():
-        lda = B.stride()[1]
-        transposed_A = False
-    else:
-        s = B.stride()
-        if s[0] != B.shape[0]:
-            B = B.contiguous()
-            lda = B.stride()[1]
-        elif s[2] == B.shape[1]:
-            transposed_A = True
-            lda = B.stride()[2]
-        else:
-            if s[2] == 1:
-                B = B.contiguous()
-                lda = B.stride()[1]
-            elif s[1] == 1:
-                B = B.contiguous()
-                lda = B.stride()[1]
-            else:
-                B = B.contiguous()
-                lda = B.stride()[1]
-
-    if A.is_contiguous():
-        ldb = A.stride()[1]
-        transposed_B = False
-    else:
-        s = A.stride()
-        if s[0] != A.shape[0]:
-            A = A.contiguous()
-            ldb = A.stride()[1]
-            transposed_B = False
-        elif s[2] == A.shape[1]:
-            ldb = A.stride()[2]
-            transposed_B = True
-        else:
-            A = A.contiguous()
-            ldb = A.stride()[1]
-            transposed_B = False
-
-    # this is a mess: cuBLAS expect column major, but PyTorch is row major.
-    # So to perform the matrix multiplication, we have to treat A, B, and C matrices
-    # (transpose of row major is column major)
-    # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these
-    # matrices in the input arguments for cuBLAS
-
-    # column major: A @ B = C: [batch, m, k] @ [batch, k, n] = [batch, m, n]
-    # row major: B^T @ A^T = C^T: [batch, m, k] @ [batch, k, n] = [batch, m, n]
-    # column major with row major layout: B^T @ A^T = C^T: [batch, k, m] @ [batch, n, k] = [batch, n, m]
-    num_batch = A.shape[0]
-    n = A.shape[1]
-    m = B.shape[2]
-    k = B.shape[1]
-
-    ldc = m
-
-    strideA = B.shape[1] * B.shape[2]
-    strideB = A.shape[1] * A.shape[2]
-    strideC = A.shape[1] * B.shape[2]
-
-    ptr = CUBLAS_Context.get_instance().get_context(A.device)
-
-    is_on_gpu([B, A, out])
-    lib.cbatched_igemm(
-        ptr,
-        ct.c_bool(transposed_B),
-        ct.c_bool(transposed_A),
-        ct.c_int32(m),
-        ct.c_int32(n),
-        ct.c_int32(k),
-        get_ptr(B),
-        get_ptr(A),
-        get_ptr(out),
-        ct.c_int32(lda),
-        ct.c_int32(ldb),
-        ct.c_int32(ldc),
-        ct.c_long(strideA),
-        ct.c_long(strideB),
-        ct.c_long(strideC),
-        ct.c_uint32(num_batch),
-    )
-    return out
-
-
-def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
-    shapeA = SA[0]
-    shapeB = SB[0]
-    dimsA = len(shapeA)
-    dimsB = len(shapeB)
-    assert dimsB == 2, "Only two dimensional matrices are supported for argument B"
-    if dimsA == 2:
-        m = shapeA[0]
-    elif dimsA == 3:
-        m = shapeA[0] * shapeA[1]
-
-    rows = n = shapeB[0]
-    assert prod(list(shapeA)) > 0, f"Input tensor dimensions need to be > 0: {shapeA}"
-
-    # if the tensor is empty, return a transformed empty tensor with the right dimensions
-    if shapeA[0] == 0 and dimsA == 2:
-        return torch.empty((0, shapeB[0]), device=A.device, dtype=torch.float16)
-    elif shapeA[1] == 0 and dimsA == 3:
-        return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
-
-    if dimsA == 2 and out is None:
-        out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row")
-    elif dimsA == 3 and out is None:
-        out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row")
-
-    assert dimsB != 3, "len(B.shape)==3 not supported"
-    assert A.device.type == "cuda"
-    assert B.device.type == "cuda"
-    assert A.dtype == torch.int8
-    assert B.dtype == torch.int8
-    assert out.dtype == dtype
-    assert SA[1] == "col32"
-    assert SB[1] in ["col_turing", "col_ampere"]
-    assert Sout[1] == "col32"
-    assert (
-        shapeA[-1] == shapeB[-1]
-    ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
-    formatB = SB[1]
-    prev_device = A.device
-    torch.cuda.set_device(A.device)
-
-    ptr = CUBLAS_Context.get_instance().get_context(A.device)
-    ptrA = get_ptr(A)
-    ptrB = get_ptr(B)
-    ptrC = get_ptr(out)
-
-    k = shapeA[-1]
-    lda = ct.c_int32(m * 32)
-    if formatB == "col_turing":
-        # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
-        # n = rows
-        ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
-    else:
-        # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
-        # n = rows
-        ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
-
-    ldc = ct.c_int32(m * 32)
-    m = ct.c_int32(m)
-    n = ct.c_int32(n)
-    k = ct.c_int32(k)
-
-    has_error = 0
-    ptrRowScale = get_ptr(None)
-    is_on_gpu([A, B, out])
-    if formatB == "col_turing":
-        if dtype == torch.int32:
-            has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
-        else:
-            has_error = lib.cigemmlt_turing_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
-    elif formatB == "col_ampere":
-        if dtype == torch.int32:
-            has_error = lib.cigemmlt_ampere_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
-        else:
-            has_error = lib.cigemmlt_ampere_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
-
-    if has_error == 1:
-        print(f"A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}")
-        raise Exception("cublasLt ran into an error!")
-
-    torch.cuda.set_device(prev_device)
-
-    return out, Sout
-
-
-def mm_dequant(A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None):
-    assert A.dtype == torch.int32
-    if bias is not None:
-        assert bias.dtype == torch.float16
-    out_shape = quant_state[0]
-    if len(out_shape) == 3:
-        out_shape = (out_shape[0] * out_shape[1], out_shape[2])
-
-    if out is None:
-        out = torch.empty(out_shape, dtype=torch.float16, device=A.device)
-    if new_row_stats is None:
-        new_row_stats = torch.empty(out_shape[0], dtype=torch.float32, device=A.device)
-    if new_col_stats is None:
-        new_col_stats = torch.empty(out_shape[1], dtype=torch.float32, device=A.device)
-    assert new_row_stats.shape[0] == row_stats.shape[0], f"{new_row_stats.shape} vs {row_stats.shape}"
-    assert new_col_stats.shape[0] == col_stats.shape[0], f"{new_col_stats.shape} vs {col_stats.shape}"
-
-    prev_device = pre_call(A.device)
-    ptrA = get_ptr(A)
-    ptrOut = get_ptr(out)
-    ptrRowStats = get_ptr(row_stats)
-    ptrColStats = get_ptr(col_stats)
-    ptrNewRowStats = get_ptr(new_row_stats)
-    ptrNewColStats = get_ptr(new_col_stats)
-    ptrBias = get_ptr(bias)
-    numRows = ct.c_int32(out_shape[0])
-    numCols = ct.c_int32(out_shape[1])
-
-    is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias])
-    lib.cdequant_mm_int32_fp16(
-        ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols
-    )
-    post_call(prev_device)
-
-    return out
-
-
-def get_colrow_absmax(A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0):
-    assert A.dtype == torch.float16
-    device = A.device
-
-    cols = A.shape[-1]
-    if len(A.shape) == 3:
-        rows = A.shape[0] * A.shape[1]
-    else:
-        rows = A.shape[0]
-
-    col_tiles = (cols + 255) // 256
-    tiled_rows = ((rows + 15) // 16) * 16
-    if row_stats is None:
-        row_stats = torch.empty((rows,), dtype=torch.float32, device=device).fill_(-50000.0)
-    if col_stats is None:
-        col_stats = torch.empty((cols,), dtype=torch.float32, device=device).fill_(-50000.0)
-
-    if nnz_block_ptr is None and threshold > 0.0:
-        nnz_block_ptr = torch.zeros(((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device)
-
-    ptrA = get_ptr(A)
-    ptrRowStats = get_ptr(row_stats)
-    ptrColStats = get_ptr(col_stats)
-    ptrNnzrows = get_ptr(nnz_block_ptr)
-    rows = ct.c_int32(rows)
-    cols = ct.c_int32(cols)
-
-    prev_device = pre_call(A.device)
-    is_on_gpu([A, row_stats, col_stats, nnz_block_ptr])
-    lib.cget_col_row_stats(ptrA, ptrRowStats, ptrColStats, ptrNnzrows, ct.c_float(threshold), rows, cols)
-    post_call(prev_device)
-
-    if threshold > 0.0:
-        nnz_block_ptr.cumsum_(0)
-
-    return row_stats, col_stats, nnz_block_ptr
-
-
-class COOSparseTensor:
-    def __init__(self, rows, cols, nnz, rowidx, colidx, values):
-        assert rowidx.dtype == torch.int32
-        assert colidx.dtype == torch.int32
-        assert values.dtype == torch.float16
-        assert values.numel() == nnz
-        assert rowidx.numel() == nnz
-        assert colidx.numel() == nnz
-
-        self.rows = rows
-        self.cols = cols
-        self.nnz = nnz
-        self.rowidx = rowidx
-        self.colidx = colidx
-        self.values = values
-
-
-class CSRSparseTensor:
-    def __init__(self, rows, cols, nnz, rowptr, colidx, values):
-        assert rowptr.dtype == torch.int32
-        assert colidx.dtype == torch.int32
-        assert values.dtype == torch.float16
-        assert values.numel() == nnz
-        assert colidx.numel() == nnz
-        assert rowptr.numel() == rows + 1
-
-        self.rows = rows
-        self.cols = cols
-        self.nnz = nnz
-        self.rowptr = rowptr
-        self.colidx = colidx
-        self.values = values
-
-
-class CSCSparseTensor:
-    def __init__(self, rows, cols, nnz, colptr, rowidx, values):
-        assert colptr.dtype == torch.int32
-        assert rowidx.dtype == torch.int32
-        assert values.dtype == torch.float16
-        assert values.numel() == nnz
-        assert rowidx.numel() == nnz
-        assert colptr.numel() == cols + 1
-
-        self.rows = rows
-        self.cols = cols
-        self.nnz = nnz
-        self.colptr = colptr
-        self.rowidx = rowidx
-        self.values = values
-
-
-def coo2csr(cooA):
-    values, counts = torch.unique(cooA.rowidx, return_counts=True)
-    values.add_(1)
-    rowptr = torch.zeros((cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device)
-    rowptr.scatter_(index=values.long(), src=counts.int(), dim=0)
-    rowptr.cumsum_(0)
-    return CSRSparseTensor(cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values)
-
-
-def coo2csc(cooA):
-    val, col2rowidx = torch.sort(cooA.colidx)
-    rowidx = cooA.rowidx[col2rowidx]
-    values = cooA.values[col2rowidx]
-    colvalues, counts = torch.unique(val, return_counts=True)
-    colvalues.add_(1)
-    colptr = torch.zeros((cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device)
-    colptr.scatter_(index=colvalues.long(), src=counts.int(), dim=0)
-    colptr.cumsum_(0)
-    return CSCSparseTensor(cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values)
-
-
-def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
-    rowidx = torch.zeros((nnz,), dtype=torch.int32, device=device)
-    colidx = torch.zeros((nnz,), dtype=torch.int32, device=device)
-    values = torch.zeros((nnz,), dtype=dtype, device=device)
-    return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values)
-
-
-def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
-    device = A.device
-    assert A.dtype == torch.half
-    assert device.type == "cuda"
-    prev_device = pre_call(A.device)
-
-    cols = A.shape[-1]
-    if len(A.shape) == 3:
-        rows = A.shape[0] * A.shape[1]
-    else:
-        rows = A.shape[0]
-
-    if row_stats is None or col_stats is None:
-        row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(A, threshold=threshold)
-
-    if out_col is None:
-        out_col = torch.zeros(A.shape, device=device, dtype=torch.int8)
-    if out_row is None:
-        out_row = torch.zeros(A.shape, device=device, dtype=torch.int8)
-
-    coo_tensor = None
-    ptrA = get_ptr(A)
-    ptrColStats = get_ptr(col_stats)
-    ptrRowStats = get_ptr(row_stats)
-    ptrOutCol = get_ptr(out_col)
-    ptrOutRow = get_ptr(out_row)
-
-    is_on_gpu([A, col_stats, row_stats, out_col, out_row])
-    if threshold > 0.0:
-        nnz = nnz_row_ptr[-1].item()
-        if nnz > 0:
-            coo_tensor = coo_zeros(A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device)
-            ptrRowIdx = get_ptr(coo_tensor.rowidx)
-            ptrColIdx = get_ptr(coo_tensor.colidx)
-            ptrVal = get_ptr(coo_tensor.values)
-            ptrRowPtr = get_ptr(nnz_row_ptr)
-
-            lib.cdouble_rowcol_quant(
-                ptrA,
-                ptrRowStats,
-                ptrColStats,
-                ptrOutCol,
-                ptrOutRow,
-                ptrRowIdx,
-                ptrColIdx,
-                ptrVal,
-                ptrRowPtr,
-                ct.c_float(threshold),
-                ct.c_int32(rows),
-                ct.c_int32(cols),
-            )
-            val, idx = torch.sort(coo_tensor.rowidx)
-            coo_tensor.rowidx = val
-            coo_tensor.colidx = coo_tensor.colidx[idx]
-            coo_tensor.values = coo_tensor.values[idx]
-        else:
-            lib.cdouble_rowcol_quant(
-                ptrA,
-                ptrRowStats,
-                ptrColStats,
-                ptrOutCol,
-                ptrOutRow,
-                None,
-                None,
-                None,
-                None,
-                ct.c_float(0.0),
-                ct.c_int32(rows),
-                ct.c_int32(cols),
-            )
-    else:
-        lib.cdouble_rowcol_quant(
-            ptrA,
-            ptrRowStats,
-            ptrColStats,
-            ptrOutCol,
-            ptrOutRow,
-            None,
-            None,
-            None,
-            None,
-            ct.c_float(threshold),
-            ct.c_int32(rows),
-            ct.c_int32(cols),
-        )
-    post_call(prev_device)
-
-    return out_row, out_col, row_stats, col_stats, coo_tensor
-
-
-def transform(A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None):
-    prev_device = pre_call(A.device)
-    if state is None:
-        state = (A.shape, from_order)
-    else:
-        from_order = state[1]
-    if out is None:
-        out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose)
-    else:
-        new_state = (state[0], to_order)  # (shape, order)
-
-    shape = state[0]
-    if len(shape) == 2:
-        dim1 = ct.c_int32(shape[0])
-        dim2 = ct.c_int32(shape[1])
-    else:
-        dim1 = ct.c_int32(shape[0] * shape[1])
-        dim2 = ct.c_int32(shape[2])
-
-    is_on_gpu([A, out])
-    if to_order == "col32":
-        if transpose:
-            lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2)
-        else:
-            lib.ctransform_row2col32(get_ptr(A), get_ptr(out), dim1, dim2)
-    elif to_order == "col_turing":
-        if transpose:
-            lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2)
-        else:
-            lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2)
-    elif to_order == "col_ampere":
-        if transpose:
-            lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2)
-        else:
-            lib.ctransform_row2ampere(get_ptr(A), get_ptr(out), dim1, dim2)
-    elif to_order == "row":
-        if from_order == "col_turing":
-            lib.ctransform_turing2row(get_ptr(A), get_ptr(out), dim1, dim2)
-        elif from_order == "col_ampere":
-            lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2)
-    else:
-        raise NotImplementedError(f"Transform function not implemented: From {from_order} to {to_order}")
-
-    post_call(prev_device)
-
-    return out, new_state
-
-
-def spmm_coo(cooA, B, out=None):
-    if out is None:
-        out = torch.empty((cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype)
-    nnz = cooA.nnz
-    assert cooA.rowidx.numel() == nnz
-    assert cooA.colidx.numel() == nnz
-    assert cooA.values.numel() == nnz
-    assert cooA.cols == B.shape[0]
-
-    transposed_B = False if B.is_contiguous() else True
-
-    ldb = B.stride()[(1 if transposed_B else 0)]
-    ldc = B.shape[1]
-
-    ptr = Cusparse_Context.get_instance().context
-
-    ptrRowidx = get_ptr(cooA.rowidx)
-    ptrColidx = get_ptr(cooA.colidx)
-    ptrValues = get_ptr(cooA.values)
-    ptrB = get_ptr(B)
-    ptrC = get_ptr(out)
-    cnnz = ct.c_int32(cooA.nnz)
-    crowsA = ct.c_int32(cooA.rows)
-    ccolsA = ct.c_int32(cooA.cols)
-    ccolsB = ct.c_int32(B.shape[1])
-    cldb = ct.c_int32(ldb)
-    cldc = ct.c_int32(ldc)
-
-    is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out])
-    lib.cspmm_coo(
-        ptr,
-        ptrRowidx,
-        ptrColidx,
-        ptrValues,
-        cnnz,
-        crowsA,
-        ccolsA,
-        ccolsB,
-        cldb,
-        ptrB,
-        cldc,
-        ptrC,
-        ct.c_bool(transposed_B),
-    )
-
-    return out
-
-
-def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None):
-    if out is None:
-        out = torch.zeros((cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype)
-    nnz = cooA.nnz
-    prev_device = pre_call(B.device)
-    assert cooA.rowidx.numel() == nnz
-    assert cooA.colidx.numel() == nnz
-    assert cooA.values.numel() == nnz
-    assert cooA.cols == B.shape[0], f"{cooA.cols} vs {B.shape}"
-
-    transposed_B = False if B.is_contiguous() else True
-
-    ldb = B.stride()[(1 if transposed_B else 0)]
-    ldc = B.shape[1]
-
-    values, counts = torch.unique(cooA.rowidx, return_counts=True)
-    offset = counts.cumsum(0).int()
-    max_count, max_idx = torch.sort(counts, descending=True)
-    max_idx = max_idx.int()
-    max_count = max_count.int()
-    assert max_count[0] <= 32, f"Current max count per row is 8 but found {max_count[0]}."
-    assert B.dtype in [torch.float16, torch.int8]
-    ptrOffset = get_ptr(offset)
-    ptrMaxCount = get_ptr(max_count)
-    ptrMaxIdx = get_ptr(max_idx)
-
-    ptrRowidx = get_ptr(cooA.rowidx)
-    ptrColidx = get_ptr(cooA.colidx)
-    ptrValues = get_ptr(cooA.values)
-    ptrB = get_ptr(B)
-    ptrC = get_ptr(out)
-    ptrDequantStats = get_ptr(dequant_stats)
-    cnnz_rows = ct.c_int32(counts.numel())
-    cnnz = ct.c_int32(cooA.nnz)
-    crowsA = ct.c_int32(cooA.rows)
-    ccolsA = ct.c_int32(cooA.cols)
-    crowsB = ct.c_int32(B.shape[1])
-    ccolsB = ct.c_int32(B.shape[1])
-    cldb = ct.c_int32(ldb)
-    cldc = ct.c_int32(ldc)
-
-    is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out, dequant_stats])
-    if B.dtype == torch.float16:
-        lib.cspmm_coo_very_sparse_naive_fp16(
-            ptrMaxCount,
-            ptrMaxIdx,
-            ptrOffset,
-            ptrRowidx,
-            ptrColidx,
-            ptrValues,
-            ptrB,
-            ptrC,
-            ptrDequantStats,
-            cnnz_rows,
-            cnnz,
-            crowsA,
-            crowsB,
-            ccolsB,
-        )
-    elif B.dtype == torch.int8:
-        lib.cspmm_coo_very_sparse_naive_int8(
-            ptrMaxCount,
-            ptrMaxIdx,
-            ptrOffset,
-            ptrRowidx,
-            ptrColidx,
-            ptrValues,
-            ptrB,
-            ptrC,
-            ptrDequantStats,
-            cnnz_rows,
-            cnnz,
-            crowsA,
-            crowsB,
-            ccolsB,
-        )
-    # else: assertion error
-    post_call(prev_device)
-
-    return out
-
-
-C = 127.0
-
-
-def vectorwise_quant(x, dim=1, quant_type="vector"):
-    if quant_type == "linear":
-        max1 = torch.abs(x).max().float()
-        xq = torch.round(x / max1 * 127).to(torch.int8)
-        return xq, max1
-    elif quant_type in ["vector", "row"]:
-        max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True)
-        xq = torch.round(x * (C / max1)).to(torch.int8)
-        return xq, max1
-    elif quant_type == "zeropoint":
-        dtype = x.dtype
-        x = x.float()
-        dyna = x.max() - x.min()
-        if dyna == 0:
-            dyna = 1
-        qx = 255.0 / dyna
-        minx = x.min()
-        zpx = torch.round(minx * qx)
-        x = torch.round(qx * x - zpx) + zpx
-        return x, qx
-    elif quant_type in ["vector-zeropoint", "row-zeropoint"]:
-        dtype = x.dtype
-        x = x.float()
-        dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin(x, dim=dim, keepdim=True)
-        dyna[dyna == 0] = 1
-        qx = 255.0 / dyna
-        minx = torch.amin(x, dim=dim, keepdim=True)
-        zpx = torch.round(minx * qx)
-        x = torch.round(qx * x - zpx) + zpx
-        return x, qx
-    elif quant_type == "truncated-vector":
-        with torch.no_grad():
-            absx = torch.abs(x)
-            max1 = torch.amax(absx, dim=dim, keepdim=True)
-            max1 = max1 * 0.7
-            idx = absx > max1.expand_as(absx)
-            sign = torch.sign(x[idx])
-            x[idx] = max1.expand_as(absx)[idx] * sign
-            xq = torch.round(x / max1 * C).to(torch.int8)
-        return xq, max1
-    else:
-        return None
-
-
-def vectorwise_dequant(xq, max1, quant_type="vector"):
-    if quant_type == "vector":
-        x = (xq / C * max1).to(torch.float32)
-        return x
-    else:
-        return None
-
-
-def vectorwise_mm_dequant(xq, S1, S2, dtype=torch.half, quant_type="vector"):
-    if quant_type == "linear":
-        norm = S1 * S2 / (C * C)
-        # double cast needed to prevent overflows
-        return (xq.float() * norm).to(dtype)
-    elif quant_type == "zeropoint":
-        norm = 1.0 / (S1 * S2)
-        return (xq.float() * norm).to(dtype)
-    elif quant_type == "row-zeropoint":
-        norm = 1.0 / (S1 * S2)
-        x = xq.float()
-        if len(S1.shape) == 3 and len(x.shape) == 2:
-            S1 = S1.squeeze(0)
-        if len(S2.shape) == 3 and len(x.shape) == 2:
-            S2 = S2.squeeze(0)
-        if len(S1.shape) == 2:
-            x *= norm
-        else:
-            x *= norm
-        return x.to(dtype)
-    elif quant_type == "vector-zeropoint":
-        x = xq.float()
-        if len(S1.shape) == 3 and len(x.shape) == 2:
-            S1 = S1.squeeze(0)
-        if len(S2.shape) == 3 and len(x.shape) == 2:
-            S2 = S2.squeeze(0)
-        if len(S1.shape) == 2:
-            x *= 1.0 / S1
-        else:
-            x *= 1.0 / S1
-        x *= 1.0 / S2.t()
-        return x.to(dtype)
-    elif quant_type == "row":
-        x = xq.float()
-        if len(S1.shape) == 3 and len(x.shape) == 2:
-            S1 = S1.squeeze(0)
-        if len(S2.shape) == 3 and len(x.shape) == 2:
-            S2 = S2.squeeze(0)
-        if len(S1.shape) == 2:
-            x *= S1 * S2 / (C * C)
-        else:
-            x *= S1 * S2 / (C * C)
-        return x.to(dtype)
-    elif quant_type in ["truncated-vector", "vector"]:
-        x = xq.float()
-        if len(S1.shape) == 3 and len(x.shape) == 2:
-            S1 = S1.squeeze(0)
-        if len(S2.shape) == 3 and len(x.shape) == 2:
-            S2 = S2.squeeze(0)
-        if len(S1.shape) == 2:
-            x *= S1 / C
-        else:
-            x *= S1 / C
-        x *= S2 / C
-        return x.to(dtype)
-    else:
-        return None
-
-
-def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
-    offset = B.float().t().sum(0) * (SA[0] + SA[1])
-    x = xq.float()
-    if len(xq.shape) == 2 and len(SB.shape) == 3:
-        SB = SB.squeeze(0)
-    if len(SB.shape) == 2:
-        x *= SB.t() / 127
-    else:
-        x *= SB / 127
-    x *= SA[1] / 127
-    x += offset
-    return x.to(dtype)
-
-
-def extract_outliers(A, SA, idx):
-    shapeA = SA[0]
-    formatA = SA[1]
-    assert formatA in ["col_turing", "col_ampere"]
-    assert A.device.type == "cuda"
-
-    out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device)
-
-    idx_size = ct.c_int32(idx.numel())
-    rows = ct.c_int32(shapeA[0])
-    cols = ct.c_int32(shapeA[1])
-    ptrA = get_ptr(A)
-    ptrIdx = get_ptr(idx)
-    ptrOut = get_ptr(out)
-
-    prev_device = pre_call(A.device)
-    if formatA == "col_turing":
-        lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
-    elif formatA == "col_ampere":
-        lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
-    post_call(prev_device)
-
-    return out
-
-
-def pipeline_test(A, batch_size):
-    out = torch.zeros_like(A)
-    lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
-    return out

From f76d6abce340060af7d458abe12546cd21891a75 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 9 May 2024 21:33:54 +0000
Subject: [PATCH 143/234] Sync README with upstream

---
 README.md | 39 +++------------------------------------
 1 file changed, 3 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 377ca2e86..2cf630dcb 100644
--- a/README.md
+++ b/README.md
@@ -6,42 +6,9 @@ The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom fu
 
 The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module.
 
-**Installation for ROCm:**
-
-To install develop version:
-```bash
-git clone --recurse https://github.com/ROCm/bitsandbytes
-cd bitsandbytes
-git checkout rocm_enabled
-pip install -r requirements-dev.txt
-cmake -DCOMPUTE_BACKEND=hip -S . (Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch)
-make
-pip install .
-```
-
-For ROCm specific versions:
-
-Install Dependencies:
-```bash
-# hipblaslt installation needed only for rocm<6.0
-apt install hipblaslt
-pip install --upgrade pip
-pip install einops lion_pytorch accelerate
-pip install git+https://github.com/ROCm/transformers.git
-```
-Install Bitsandbytes:
-```bash
-git clone --recurse https://github.com/ROCm/bitsandbytes
-cd bitsandbytes
-# Checkout branch as needed
-# for rocm 5.7 - rocm5.7_internal_testing
-# for rocm 6.x - rocm6.2_internal_testing
-git checkout <branch>
-make hip
-python setup.py install
-```
-
-**For more details, please head to the official documentation page:**
+There are ongoing efforts to support further hardware backends, i.e. Intel CPU + GPU, AMD GPU, Apple Silicon. Windows support is quite far along and is on its way as well.
+
+**Please head to the official documentation page:**
 
 **[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)**
 

From 576b62cde8eaa7ad9f7b5aa35051e9cce07e0a93 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 9 May 2024 21:36:07 +0000
Subject: [PATCH 144/234] Remove bnb_accuracy file

---
 benchmarking/accuracy/bnb_accuracy.py | 26 --------------------------
 1 file changed, 26 deletions(-)
 delete mode 100644 benchmarking/accuracy/bnb_accuracy.py

diff --git a/benchmarking/accuracy/bnb_accuracy.py b/benchmarking/accuracy/bnb_accuracy.py
deleted file mode 100644
index 2860338ec..000000000
--- a/benchmarking/accuracy/bnb_accuracy.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import torch
-
-from bitsandbytes import functional as F
-
-
-def debug_blocksize(block):
-    x = torch.randn(4096, 4096).cuda()
-    qx, qstate = F.quantize_fp4(x, blocksize=block)
-    dq = F.dequantize_fp4(qx, qstate)
-    return torch.sum(torch.linalg.norm(x - dq, ord="fro"))
-
-
-def test_blocksize(block):
-    x = torch.randn(10, 10).cuda()
-    qx, qstate = F.quantize_fp4(x, blocksize=block)
-    print(x)
-    print("---------------")
-    print(qx)
-    print("---------------")
-    print(qstate)
-
-
-for block in [128, 256, 512, 1024, 2048]:
-    print(debug_blocksize(block))
-
-# test_blocksize(2048)

From dfb531b7d6c9e9532340479a2bc4b09fccea5938 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 9 May 2024 21:39:23 +0000
Subject: [PATCH 145/234] Remove cuda_setup

---
 bitsandbytes/cuda_setup/main.py | 451 --------------------------------
 1 file changed, 451 deletions(-)
 delete mode 100644 bitsandbytes/cuda_setup/main.py

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
deleted file mode 100644
index b2f9214a4..000000000
--- a/bitsandbytes/cuda_setup/main.py
+++ /dev/null
@@ -1,451 +0,0 @@
-"""
-extract factors the build is dependent on:
-[X] compute capability
-    [ ] TODO: Q - What if we have multiple GPUs of different makes?
-- CUDA version
-- Software:
-    - CPU-only: only CPU quantization functions (no optimizer, no matrix multiplication)
-    - CuBLAS-LT: full-build 8-bit optimizer
-    - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
-
-evaluation:
-    - if paths faulty, return meaningful error
-    - else:
-        - determine CUDA version
-        - determine capabilities
-        - based on that set the default path
-"""
-
-import ctypes as ct
-import errno
-import os
-from pathlib import Path
-from typing import Set, Union
-from warnings import warn
-
-import torch
-
-from .env_vars import get_potentially_lib_path_containing_env_vars
-
-# these are the most common libs names
-# libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
-# we have libcudart.so.11.0 which causes a lot of errors before
-# not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt
-CUDA_RUNTIME_LIBS: list = [
-    "libcudart.so",
-    "libcudart.so.11.0",
-    "libcudart.so.12.0",
-    "libcudart.so.12.1",
-    "libcudart.so.12.2",
-]
-
-# this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths
-backup_paths = []
-backup_paths.append("$CONDA_PREFIX/lib/libcudart.so.11.0")
-
-
-class CUDASetup:
-    _instance = None
-
-    def __init__(self):
-        raise RuntimeError("Call get_instance() instead")
-
-    def generate_instructions(self):
-        if getattr(self, "error", False):
-            return
-        print(self.error)
-        self.error = True
-        if not self.cuda_available:
-            self.add_log_entry(
-                "CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected or CUDA not installed."
-            )
-            self.add_log_entry(
-                "CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig."
-            )
-            self.add_log_entry("CUDA SETUP: Solution 2): If you do not have sudo rights, you can do the following:")
-            self.add_log_entry(
-                "CUDA SETUP: Solution 2a): Find the cuda library via: find / -name libcuda.so 2>/dev/null"
-            )
-            self.add_log_entry(
-                "CUDA SETUP: Solution 2b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_2a"
-            )
-            self.add_log_entry(
-                "CUDA SETUP: Solution 2c): For a permanent solution add the export from 2b into your .bashrc file, located at ~/.bashrc"
-            )
-            self.add_log_entry(
-                "CUDA SETUP: Solution 3): For a missing CUDA runtime library (libcudart.so), use `find / -name libcudart.so* and follow with step (2b)"
-            )
-            return
-
-        if self.cudart_path is None:
-            self.add_log_entry(
-                "CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected."
-            )
-            self.add_log_entry(
-                "CUDA SETUP: Solution 1: To solve the issue the libcudart.so location needs to be added to the LD_LIBRARY_PATH variable"
-            )
-            self.add_log_entry(
-                "CUDA SETUP: Solution 1a): Find the cuda runtime library via: find / -name libcudart.so 2>/dev/null"
-            )
-            self.add_log_entry(
-                "CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a"
-            )
-            self.add_log_entry(
-                "CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc"
-            )
-            self.add_log_entry("CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.")
-            self.add_log_entry(
-                "CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh"
-            )
-            self.add_log_entry(
-                "CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO."
-            )
-            self.add_log_entry(
-                'CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local'
-            )
-
-            return
-
-        make_cmd = f"CUDA_VERSION={self.cuda_version_string}"
-        if len(self.cuda_version_string) < 3:
-            make_cmd += " make cuda92"
-        elif self.cuda_version_string == "110":
-            make_cmd += " make cuda110"
-        elif self.cuda_version_string[:2] == "11" and int(self.cuda_version_string[2]) > 0:
-            make_cmd += " make cuda11x"
-        elif self.cuda_version_string[:2] == "12" and 1 >= int(self.cuda_version_string[2]) >= 0:
-            make_cmd += " make cuda12x"
-        elif self.cuda_version_string == "100":
-            self.add_log_entry("CUDA SETUP: CUDA 10.0 not supported. Please use a different CUDA version.")
-            self.add_log_entry(
-                "CUDA SETUP: Before you try again running bitsandbytes, make sure old CUDA 10.0 versions are uninstalled and removed from $LD_LIBRARY_PATH variables."
-            )
-            return
-
-        has_cublaslt = is_cublasLt_compatible(self.cc)
-        if not has_cublaslt:
-            make_cmd += "_nomatmul"
-
-        self.add_log_entry("CUDA SETUP: Something unexpected happened. Please compile from source:")
-        self.add_log_entry("git clone https://github.com/TimDettmers/bitsandbytes.git")
-        self.add_log_entry("cd bitsandbytes")
-        self.add_log_entry(make_cmd)
-        self.add_log_entry("python setup.py install")
-
-    def initialize(self):
-        if not getattr(self, "initialized", False):
-            self.has_printed = False
-            self.lib = None
-            self.initialized = False
-            self.error = False
-
-    def manual_override(self):
-        if torch.cuda.is_available():
-            if "BNB_CUDA_VERSION" in os.environ:
-                if len(os.environ["BNB_CUDA_VERSION"]) > 0:
-                    warn(
-                        f'\n\n{"="*80}\n'
-                        'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
-                        'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
-                        'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
-                        'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
-                        'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
-                        f'Loading CUDA version: BNB_CUDA_VERSION={os.environ["BNB_CUDA_VERSION"]}'
-                        f'\n{"="*80}\n\n'
-                    )
-                    self.binary_name = self.binary_name[:-6] + f'{os.environ["BNB_CUDA_VERSION"]}.so'
-
-    def run_cuda_setup(self):
-        self.initialized = True
-        self.cuda_setup_log = []
-
-        binary_name, cudart_path, cc, cuda_version_string = evaluate_cuda_setup()
-        self.cudart_path = cudart_path
-        self.cuda_available = torch.cuda.is_available()
-        self.cc = cc
-        self.cuda_version_string = cuda_version_string
-        self.binary_name = binary_name
-        self.manual_override()
-
-        package_dir = Path(__file__).parent.parent
-        binary_path = package_dir / self.binary_name
-
-        try:
-            if not binary_path.exists():
-                self.add_log_entry(
-                    f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?"
-                )
-                legacy_binary_name = "libbitsandbytes_cpu.so"
-                self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
-                binary_path = package_dir / legacy_binary_name
-                if not binary_path.exists() or torch.cuda.is_available():
-                    self.add_log_entry("")
-                    self.add_log_entry("=" * 48 + "ERROR" + "=" * 37)
-                    self.add_log_entry("CUDA SETUP: CUDA detection failed! Possible reasons:")
-                    self.add_log_entry(
-                        "1. You need to manually override the PyTorch CUDA version. Please see: "
-                        '"https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md'
-                    )
-                    self.add_log_entry("2. CUDA driver not installed")
-                    self.add_log_entry("3. CUDA not installed")
-                    self.add_log_entry("4. You have multiple conflicting CUDA libraries")
-                    self.add_log_entry("5. Required library not pre-compiled for this bitsandbytes release!")
-                    self.add_log_entry(
-                        "CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`."
-                    )
-                    self.add_log_entry(
-                        "CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via `conda list | grep cuda`."
-                    )
-                    self.add_log_entry("=" * 80)
-                    self.add_log_entry("")
-                    self.generate_instructions()
-                    raise Exception("CUDA SETUP: Setup Failed!")
-                self.lib = ct.cdll.LoadLibrary(binary_path)
-            else:
-                self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path}...")
-                self.lib = ct.cdll.LoadLibrary(binary_path)
-        except Exception as ex:
-            # debug
-            self.add_log_entry("Exception in run_cuda_setup: \n")
-            self.add_log_entry(str(ex))
-
-    def add_log_entry(self, msg, is_warning=False):
-        self.cuda_setup_log.append((msg, is_warning))
-
-    def print_log_stack(self):
-        for msg, is_warning in self.cuda_setup_log:
-            if is_warning:
-                warn(msg)
-            else:
-                print(msg)
-
-    @classmethod
-    def get_instance(cls):
-        if cls._instance is None:
-            cls._instance = cls.__new__(cls)
-            cls._instance.initialize()
-        return cls._instance
-
-
-def is_cublasLt_compatible(cc):
-    has_cublaslt = False
-    if cc is not None:
-        cc_major, cc_minor = cc.split(".")
-        if int(cc_major) < 7 or (int(cc_major) == 7 and int(cc_minor) < 5):
-            CUDASetup.get_instance().add_log_entry(
-                "WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU! \
-                    If you run into issues with 8-bit matmul, you can try 4-bit quantization: https://huggingface.co/blog/4bit-transformers-bitsandbytes",
-                is_warning=True,
-            )
-        else:
-            has_cublaslt = True
-    return has_cublaslt
-
-
-def extract_candidate_paths(paths_list_candidate: str) -> Set[Path]:
-    return {Path(ld_path) for ld_path in paths_list_candidate.split(":") if ld_path}
-
-
-def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
-    existent_directories: Set[Path] = set()
-    for path in candidate_paths:
-        try:
-            if path.exists():
-                existent_directories.add(path)
-        except PermissionError as pex:
-            # Handle the PermissionError first as it is a subtype of OSError
-            # https://docs.python.org/3/library/exceptions.html#exception-hierarchy
-            pass
-        except OSError as exc:
-            if exc.errno != errno.ENAMETOOLONG:
-                raise exc
-
-    non_existent_directories: Set[Path] = candidate_paths - existent_directories
-    if non_existent_directories:
-        CUDASetup.get_instance().add_log_entry(
-            "The following directories listed in your path were found to "
-            f"be non-existent: {non_existent_directories}",
-            is_warning=False,
-        )
-
-    return existent_directories
-
-
-def get_cuda_runtime_lib_paths(candidate_paths: Set[Path]) -> Set[Path]:
-    paths = set()
-    for libname in CUDA_RUNTIME_LIBS:
-        for path in candidate_paths:
-            try:
-                if (path / libname).is_file():
-                    paths.add(path / libname)
-            except PermissionError:
-                pass
-    return paths
-
-
-def resolve_paths_list(paths_list_candidate: str) -> Set[Path]:
-    """
-    Searches a given environmental var for the CUDA runtime library,
-    i.e. `libcudart.so`.
-    """
-    return remove_non_existent_dirs(extract_candidate_paths(paths_list_candidate))
-
-
-def find_cuda_lib_in(paths_list_candidate: str) -> Set[Path]:
-    return get_cuda_runtime_lib_paths(resolve_paths_list(paths_list_candidate))
-
-
-def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
-    if len(results_paths) > 1:
-        warning_msg = (
-            f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. "
-            "We select the PyTorch default libcudart.so, which is {torch.version.cuda},"
-            "but this might mismatch with the CUDA version that is needed for bitsandbytes."
-            "To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variable"
-            "For example, if you want to use the CUDA version 122"
-            "BNB_CUDA_VERSION=122 python ..."
-            "OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122"
-            "In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g."
-            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2"
-        )
-        CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True)
-
-
-def determine_cuda_runtime_lib_path() -> Union[Path, None]:
-    """
-    Searches for a cuda installations, in the following order of priority:
-        1. active conda env
-        2. LD_LIBRARY_PATH
-        3. any other env vars, while ignoring those that
-            - are known to be unrelated (see `bnb.cuda_setup.env_vars.to_be_ignored`)
-            - don't contain the path separator `/`
-
-    If multiple libraries are found in part 3, we optimistically try one,
-    while giving a warning message.
-    """
-    candidate_env_vars = get_potentially_lib_path_containing_env_vars()
-
-    cuda_runtime_libs = set()
-    if "CONDA_PREFIX" in candidate_env_vars:
-        conda_libs_path = Path(candidate_env_vars["CONDA_PREFIX"]) / "lib"
-
-        conda_cuda_libs = find_cuda_lib_in(str(conda_libs_path))
-        warn_in_case_of_duplicates(conda_cuda_libs)
-
-        if conda_cuda_libs:
-            cuda_runtime_libs.update(conda_cuda_libs)
-
-        CUDASetup.get_instance().add_log_entry(
-            f'{candidate_env_vars["CONDA_PREFIX"]} did not contain '
-            f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...',
-            is_warning=True,
-        )
-
-    if "LD_LIBRARY_PATH" in candidate_env_vars:
-        lib_ld_cuda_libs = find_cuda_lib_in(candidate_env_vars["LD_LIBRARY_PATH"])
-
-        if lib_ld_cuda_libs:
-            cuda_runtime_libs.update(lib_ld_cuda_libs)
-        warn_in_case_of_duplicates(lib_ld_cuda_libs)
-
-        CUDASetup.get_instance().add_log_entry(
-            f'{candidate_env_vars["LD_LIBRARY_PATH"]} did not contain '
-            f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...',
-            is_warning=True,
-        )
-
-    remaining_candidate_env_vars = {
-        env_var: value
-        for env_var, value in candidate_env_vars.items()
-        if env_var not in {"CONDA_PREFIX", "LD_LIBRARY_PATH"}
-    }
-
-    cuda_runtime_libs = set()
-    for env_var, value in remaining_candidate_env_vars.items():
-        cuda_runtime_libs.update(find_cuda_lib_in(value))
-
-    if len(cuda_runtime_libs) == 0:
-        CUDASetup.get_instance().add_log_entry(
-            "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths..."
-        )
-        cuda_runtime_libs.update(find_cuda_lib_in("/usr/local/cuda/lib64"))
-
-    warn_in_case_of_duplicates(cuda_runtime_libs)
-
-    cuda_setup = CUDASetup.get_instance()
-    cuda_setup.add_log_entry(f"DEBUG: Possible options found for libcudart.so: {cuda_runtime_libs}")
-
-    return next(iter(cuda_runtime_libs)) if cuda_runtime_libs else None
-
-
-# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION
-def get_cuda_version():
-    major, minor = map(int, torch.version.cuda.split("."))
-
-    if major < 11:
-        CUDASetup.get_instance().add_log_entry(
-            "CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!"
-        )
-
-    return f"{major}{minor}"
-
-
-def get_compute_capabilities():
-    ccs = []
-    for i in range(torch.cuda.device_count()):
-        cc_major, cc_minor = torch.cuda.get_device_capability(torch.cuda.device(i))
-        ccs.append(f"{cc_major}.{cc_minor}")
-
-    ccs.sort(key=lambda v: tuple(map(int, str(v).split("."))))
-
-    return ccs
-
-
-def evaluate_cuda_setup():
-    cuda_setup = CUDASetup.get_instance()
-    if "BITSANDBYTES_NOWELCOME" not in os.environ or str(os.environ["BITSANDBYTES_NOWELCOME"]) == "0":
-        cuda_setup.add_log_entry("")
-        cuda_setup.add_log_entry("=" * 35 + "BUG REPORT" + "=" * 35)
-        cuda_setup.add_log_entry(
-            ("Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n"),
-            (
-                "and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues"
-            ),
-        )
-        cuda_setup.add_log_entry("=" * 80)
-    if not torch.cuda.is_available():
-        return "libbitsandbytes_cpu.so", None, None, None
-    if torch.version.hip:
-        return "libbitsandbytes_hip_nohipblaslt.so", None, None, None
-
-    cudart_path = determine_cuda_runtime_lib_path()
-    ccs = get_compute_capabilities()
-    ccs.sort()
-    cc = ccs[-1]  # we take the highest capability
-    cuda_version_string = get_cuda_version()
-
-    cuda_setup.add_log_entry(
-        f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}."
-    )
-    cuda_setup.add_log_entry(
-        "CUDA SETUP: To manually override the PyTorch CUDA version please see:"
-        "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md"
-    )
-
-    # 7.5 is the minimum CC vor cublaslt
-    has_cublaslt = is_cublasLt_compatible(cc)
-
-    # TODO:
-    # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible)
-    # (2) Multiple CUDA versions installed
-
-    # we use ls -l instead of nvcc to determine the cuda version
-    # since most installations will have the libcudart.so installed, but not the compiler
-
-    if has_cublaslt:
-        binary_name = f"libbitsandbytes_cuda{cuda_version_string}.so"
-    else:
-        "if not has_cublaslt (CC < 7.5), then we have to choose  _nocublaslt.so"
-        binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt.so"
-
-    return binary_name, cudart_path, cc, cuda_version_string

From 31b1cbc51eaa8802c225cab8547d38661b757036 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 9 May 2024 21:49:57 +0000
Subject: [PATCH 146/234] Remove test_delete_later.c

---
 csrc/test_delete_later.c | 375 ---------------------------------------
 1 file changed, 375 deletions(-)
 delete mode 100644 csrc/test_delete_later.c

diff --git a/csrc/test_delete_later.c b/csrc/test_delete_later.c
deleted file mode 100644
index 21dab4580..000000000
--- a/csrc/test_delete_later.c
+++ /dev/null
@@ -1,375 +0,0 @@
-// !!! This is a file automatically generated by hipify!!!
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-#if BUILD_CUDA
-#include <ops.cuh>
-#endif
-#include <cpu_ops.h>
-
-// We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
-// We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but its better than to
-// maintain all that boilerplate
-//===================================================================================
-//                               UNMANGLED CALLS
-//===================================================================================
-
-#if BUILD_CUDA
-void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles<float>(A, code, offset, n); }
-void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles<half>(A, code, offset, n); }
-
-
-//void gemm_host_fp32(int M, int N, int K, float * A,  float* B,  float * out,  int lda, int ldb, int ldc)
-//{ gemm_host<float>(M, N, K, A, B, out, lda, ldb, ldc, 32); }
-void gemm_host_fp16(int M, int N, int K, half * A,  half* B,  half * out,  int lda, int ldb, int ldc)
-{ gemm_host<half>(M, N, K, A, B, out, lda, ldb, ldc, 16); }
-
-void gemm_4bit_inference(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize)
-{ gemm_4bit_inference<half>(m, n, k, A, B, absmax,  out, lda, ldb, ldc, blocksize); }
-
-#define MAKE_ELEMENTWISE_FUNC(fname, type_name, ctype, FUNC) \
-void fname##_##type_name(ctype *A, ctype *B, ctype value, long n){ func<ctype, FUNC>(A, B, value, n); } \
-
-MAKE_ELEMENTWISE_FUNC(fill, fp32, float, FILL)
-MAKE_ELEMENTWISE_FUNC(fill, uint8, unsigned char, FILL)
-MAKE_ELEMENTWISE_FUNC(arange, fp32, float, ARANGE)
-MAKE_ELEMENTWISE_FUNC(_mul, fp32, float, _MUL)
-
-
-#define MAKE_FUNC32(fname, oname, gtype, gbits) \
-void fname##32bit_grad_##gbits(gtype *g, gtype *p, \
-               float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
-               const float beta1, const float beta2, const float eps, const float weight_decay, \
-               const int step, const float lr, float gnorm_scale, bool skip_zeros, const int n) \
-{ optimizer32bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n); } \
-
-MAKE_FUNC32(momentum, MOMENTUM, float, 32)
-MAKE_FUNC32(momentum, MOMENTUM, half, 16)
-MAKE_FUNC32(adam, ADAM, float, fp32)
-MAKE_FUNC32(adam, ADAM, half, fp16)
-MAKE_FUNC32(adam, ADAM, __nv_bfloat16, bf16)
-MAKE_FUNC32(rmsprop, RMSPROP, float, 32)
-MAKE_FUNC32(rmsprop, RMSPROP, half, 16)
-MAKE_FUNC32(lion, LION, float, fp32)
-MAKE_FUNC32(lion, LION, half, fp16)
-MAKE_FUNC32(lion, LION, __nv_bfloat16, bf16)
-MAKE_FUNC32(adagrad, ADAGRAD, float, 32)
-MAKE_FUNC32(adagrad, ADAGRAD, half, 16)
-
-#define MAKE_FUNC8(fname, oname, gtype, gbits) \
-void fname##_static_8bit_grad_##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
-								float *unorm, float max_unorm, float param_norm, \
-                float beta1, float beta2, \
-                float eps, int step, float lr,  \
-                float* quantiles1, float* quantiles2, \
-                float* max1, float* max2, float* new_max1, float* new_max2, \
-                float weight_decay, float gnorm_scale, int n) \
-{  \
-	optimizerStatic8bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
-			                                  quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
-} \
-
-MAKE_FUNC8(adam, ADAM, float, 32)
-MAKE_FUNC8(adam, ADAM, half, 16)
-MAKE_FUNC8(momentum, MOMENTUM, float, 32)
-MAKE_FUNC8(momentum, MOMENTUM, half, 16)
-MAKE_FUNC8(rmsprop, RMSPROP, float, 32)
-MAKE_FUNC8(rmsprop, RMSPROP, half, 16)
-MAKE_FUNC8(lion, LION, float, 32)
-MAKE_FUNC8(lion, LION, half, 16)
-
-#define MAKE_BLOCKWISE8(fname, optim_name, gtype, gbits) \
-void fname##_8bit_blockwise_grad_##gbits(gtype* p, gtype* g, \
-                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
-                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n)\
-{	optimizerStatic8bitBlockwise<gtype, optim_name>(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); }\
-
-MAKE_BLOCKWISE8(adam, ADAM, half, fp16)
-MAKE_BLOCKWISE8(adam, ADAM, float, fp32)
-MAKE_BLOCKWISE8(momentum, MOMENTUM, half, fp16)
-MAKE_BLOCKWISE8(momentum, MOMENTUM, float, fp32)
-MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, fp16)
-MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, fp32)
-MAKE_BLOCKWISE8(adagrad, ADAGRAD, half, fp16)
-MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, fp32)
-MAKE_BLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16)
-MAKE_BLOCKWISE8(lion, LION, half, fp16)
-MAKE_BLOCKWISE8(lion, LION, float, fp32)
-MAKE_BLOCKWISE8(lion, LION, __nv_bfloat16, bf16)
-
-
-void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); }
-void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping<half>(g, gnorm_vec, step, n); }
-
-void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
-void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, General8bit>(code, A, absmax, out, NULL, 0, blocksize, n); }
-void quantizeBlockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
-void quantizeBlockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, FP4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
-void quantizeBlockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
-void quantizeBlockwise_fp32_nf4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0, NF4>(NULL, A, absmax, out, NULL, 0, blocksize, n); }
-
-void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half, General8bit>(code, A, absmax, out, blocksize, n); } \
-void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, General8bit>(code, A, absmax, out, blocksize, n); }
-void dequantizeBlockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half, FP4>(NULL, A, absmax, out, blocksize, n); } \
-void dequantizeBlockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, FP4>(NULL, A, absmax, out, blocksize, n); }
-void dequantizeBlockwise_fp16_nf4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half, NF4>(NULL, A, absmax, out, blocksize, n); } \
-void dequantizeBlockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float, NF4>(NULL, A, absmax, out, blocksize, n); }
-
-
-#define MAKE_FUNC_TRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
-void transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(cublasLtHandle_t ltHandle, dtype *A, dtype *out, int dim1, int dim2) \
-{ \
-	transform<dtype, src, target, transpose, bits>(ltHandle, A, out, dim1, dim2); \
-} \
-
-MAKE_FUNC_TRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8);
-MAKE_FUNC_TRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8);
-MAKE_FUNC_TRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8);
-MAKE_FUNC_TRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32);
-MAKE_FUNC_TRANSFORM(8, row, col_turing, n, int8_t, ROW, COL_TURING, false, 8);
-MAKE_FUNC_TRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8);
-MAKE_FUNC_TRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8);
-MAKE_FUNC_TRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32);
-
-void transform_row2col32(char * A, char *out, int rows, int cols){ transformRowToFormat<COL32, 0>(A, out, rows, cols); }
-void transform_row2col32T(char * A, char *out, int rows, int cols){ transformRowToFormat<COL32, 1>(A, out, rows, cols); }
-void transform_row2turing(char * A, char *out, int rows, int cols){ transformRowToFormat<COL_TURING, 0>(A, out, rows, cols); }
-void transform_row2turingT(char * A, char *out, int rows, int cols){ transformRowToFormat<COL_TURING, 1>(A, out, rows, cols); }
-void transform_row2ampere(char * A, char *out, int rows, int cols){ transformRowToFormat<COL_AMPERE, 0>(A, out, rows, cols); }
-void transform_row2ampereT(char * A, char *out, int rows, int cols){ transformRowToFormat<COL_AMPERE, 1>(A, out, rows, cols); }
-
-void extractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_TURING>(A, idx, out, idx_size, rows, cols); }
-void extractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_AMPERE>(A, idx, out, idx_size, rows, cols); }
-
- int igemmlt_turing_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_TURING, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_turing_8(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_TURING, 8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_turing_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_TURING, 8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_ampere_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_AMPERE, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_ampere_8(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_AMPERE, 8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_ampere_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_AMPERE, 8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-void spmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
-{ spmm_coo_very_sparse_naive<half, 16>(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
-
-void spmm_coo_very_sparse_naive_int8(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
-{ spmm_coo_very_sparse_naive<signed char, 8>(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
-#endif
-
-extern "C"
-{
-#if BUILD_CUDA
-	void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
-	void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
-	void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
-	void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); }
-  void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
-  void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); }
-
-  void cdequantize_blockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
-  void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); }
-
-  void cquantize_blockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n); }
-  void cquantize_blockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); }
-  void cdequantize_blockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n); }
-  void cdequantize_blockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); }
-  void cquantize_blockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n); }
-  void cquantize_blockwise_fp32_nf4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); }
-  void cdequantize_blockwise_fp16_nf4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n); }
-  void cdequantize_blockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); }
-
-	#define MAKE_CFUNC32(name, gtype, gbits) \
-	void c##name##32bit_grad_##gbits(gtype *g, gtype *p, \
-								 float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
-								 const float beta1, const float beta2, const float eps, const float weight_decay, \
-								 const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n) \
-	{ name##32bit_grad_##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n); } \
-
-	MAKE_CFUNC32(adam, float, fp32)
-	MAKE_CFUNC32(adam, half, fp16)
-	MAKE_CFUNC32(adam, __nv_bfloat16, bf16)
-	MAKE_CFUNC32(momentum, float, 32)
-	MAKE_CFUNC32(momentum, half, 16)
-	MAKE_CFUNC32(rmsprop, float, 32)
-	MAKE_CFUNC32(rmsprop, half, 16)
-	MAKE_CFUNC32(lion, float, fp32)
-	MAKE_CFUNC32(lion, half, fp16)
-	MAKE_CFUNC32(lion, __nv_bfloat16, bf16)
-	MAKE_CFUNC32(adagrad, float, 32)
-	MAKE_CFUNC32(adagrad, half, 16)
-
-	#define MAKE_CFUNC8(name, gtype, gbits) \
-	void c##name##_static_8bit_grad_##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
-                float *unorm, float max_unorm, float param_norm, \
-                float beta1, float beta2, \
-                float eps, int step, float lr,  \
-                float* quantiles1, float* quantiles2, \
-                float* max1, float* max2, float* new_max1, float* new_max2, \
-                float weight_decay, float gnorm_scale, int n) \
-  {  \
-	    name##_static_8bit_grad_##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
-			                                 quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
-  } \
-
-	MAKE_CFUNC8(adam, float, 32)
-	MAKE_CFUNC8(adam, half, 16)
-	MAKE_CFUNC8(momentum, float, 32)
-	MAKE_CFUNC8(momentum, half, 16)
-	MAKE_CFUNC8(rmsprop, float, 32)
-	MAKE_CFUNC8(rmsprop, half, 16)
-	MAKE_CFUNC8(lion, float, 32)
-	MAKE_CFUNC8(lion, half, 16)
-
-  #define MAKE_CBLOCKWISE8(fname, optim_name, gtype, gbits) \
-  void c##fname##_8bit_blockwise_grad_##gbits(gtype* p, gtype* g, \
-                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,  \
-                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n) \
-  {	fname##_8bit_blockwise_grad_##gbits(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); } \
-
-	MAKE_CBLOCKWISE8(adam, ADAM, half, fp16)
-	MAKE_CBLOCKWISE8(adam, ADAM, float, fp32)
-	MAKE_CBLOCKWISE8(momentum, MOMENTUM, half, fp16)
-	MAKE_CBLOCKWISE8(momentum, MOMENTUM, float, fp32)
-	MAKE_CBLOCKWISE8(rmsprop, RMSPROP, half, fp16)
-	MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, fp32)
-	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, half, fp16)
-	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, float, fp32)
-	MAKE_CBLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16)
-	MAKE_CBLOCKWISE8(lion, LION, half, fp16)
-	MAKE_CBLOCKWISE8(lion, LION, float, fp32)
-	MAKE_CBLOCKWISE8(lion, LION, __nv_bfloat16, bf16)
-
-	void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); }
-	void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); }
-	void chistogram_scatter_add_2d(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n){ histogramScatterAdd2D(histogram, index1, index2, src, maxidx1, n); }
-
-	void cigemm(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc)
-	{ gemmex(context, transposeA, transposeB, m, n, k, A, B, C, lda, ldb, ldc); }
-	void cbatched_igemm(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
-			               long strideA, long strideB, long strideC, int batchCount)
-	{ strided_gemmex(context, transposeA, transposeB, m, n, k, A, B, C, lda, ldb, ldc, strideA, strideB, strideC, batchCount); }
-
-	Context *get_context(){ return new Context(); }
-	ContextCusparse *get_cusparse(){ return new ContextCusparse(); }
-
-	int cigemmlt_turing_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_turing_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-	//{ (cublasLtHandle_t)context->m_handle; return 0; }
-	//{ return 0; }//igemmlt_turing_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_turing_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_turing_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_turing_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_turing_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_ampere_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_ampere_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_ampere_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_ampere_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_ampere_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_ampere_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-  #define MAKE_FUNC_CTRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
-	void ctransform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(Context *context, dtype *A, dtype *out, int dim1, int dim2) \
-	{ \
-		transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((cublasLtHandle_t) context->m_handle, A, out, dim1, dim2); \
-	} \
-
-	MAKE_FUNC_CTRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8)
-	MAKE_FUNC_CTRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8)
-	MAKE_FUNC_CTRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8)
-	MAKE_FUNC_CTRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32)
-	MAKE_FUNC_CTRANSFORM(8, row, col_turing, n, int8_t, ROW, COL_TURING, false, 8)
-	MAKE_FUNC_CTRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8)
-	MAKE_FUNC_CTRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8)
-	MAKE_FUNC_CTRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32)
-
-	void cdequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols)
-	{ dequant_mm_int32_fp16(A, rowStats, colStats, out, newRowStats, newcolStats, bias, numRows, numCols); }
-	void cget_col_row_stats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols)
-	{ getColRowStats(A, rowStats, colStats, nnz_count_row, nnz_threshold, rows, cols); }
-
-  void cdouble_rowcol_quant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int *nnz_row_ptr, float threshold, int rows, int cols)
-	{ doubleRowColQuant(A, rowStats, colStats, out_col_normed, out_row_normed, rowidx, colidx, val, nnz_row_ptr, threshold, rows, cols); }
-
-	void ctransform_row2col32(char * A, char *out, int rows, int cols)
-	{ transform_row2col32(A, out, rows, cols); }
-
-	void ctransform_row2col32T(char * A, char *out, int rows, int cols)
-	{ transform_row2col32T(A, out, rows, cols); }
-
-	void ctransform_row2turing(char * A, char *out, int rows, int cols)
-	{ transform_row2turing(A, out, rows, cols); }
-
-	void ctransform_row2turingT(char * A, char *out, int rows, int cols)
-	{ transform_row2turingT(A, out, rows, cols); }
-
-	void ctransform_row2ampere(char * A, char *out, int rows, int cols)
-	{ transform_row2ampere(A, out, rows, cols); }
-
-	void ctransform_row2ampereT(char * A, char *out, int rows, int cols)
-	{ transform_row2ampereT(A, out, rows, cols); }
-
-	void cspmm_coo(ContextCusparse *context, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B)
-  { spmm_coo((hipsparseHandle_t) context->m_handle, A_rowidx, A_colidx, A_vals, A_nnz, A_rows, A_cols, B_cols, ldb, B, ldc, C, transposed_B); }
-
-	void cspmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
-	{ spmm_coo_very_sparse_naive_fp16(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
-
-	void cspmm_coo_very_sparse_naive_int8(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
-	{ spmm_coo_very_sparse_naive_int8(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
-
-	void cextractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_turing(A, idx, out, idx_size, rows, cols); }
-	void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_ampere(A, idx, out, idx_size, rows, cols); }
-
-	//void cgemm_host_fp32(int M, int N, int K, float * A,  float* B,  float * out,  int lda, int ldb, int ldc)
-	//{ gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); }
-
-	void cgemm_host_fp16(int M, int N, int K, half * A,  half* B,  half * out,  int lda, int ldb, int ldc)
-	{ gemm_host_fp16(M, N, K, A, B, out, lda, ldb, ldc); }
-
-	void cgemm_4bit_inference(int m, int n, int k, half * A,  unsigned char* B,  float *absmax, half * out,  int lda, int ldb, int ldc, int blocksize)
-	{ gemm_4bit_inference(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); }
-
-	void *cget_managed_ptr(size_t bytes)
-	{
-		void *ptr;
-		CUDA_CHECK_RETURN(hipMallocManaged(&ptr, bytes, hipMemAttachHost));
-		CUDA_CHECK_RETURN(hipPeekAtLastError());
-
-		return ptr;
-	}
-
-	void cprefetch(void *ptr, size_t bytes, int device)
-	{
-		CUDA_CHECK_RETURN(hipMemPrefetchAsync(ptr, bytes, device, 0));
-		CUDA_CHECK_RETURN(hipPeekAtLastError());
-	}
-
-  #define CMAKE_ELEMENTWISE_FUNC(fname, type_name, ctype, FUNC) \
-	void c##fname##_##type_name(ctype *A, ctype *B, ctype value, long n){ fname##_##type_name(A, B, value, n); } \
-
-	CMAKE_ELEMENTWISE_FUNC(fill, fp32, float, FILL)
-	CMAKE_ELEMENTWISE_FUNC(fill, uint8, unsigned char, FILL)
-	CMAKE_ELEMENTWISE_FUNC(arange, fp32, float, ARANGE)
-	CMAKE_ELEMENTWISE_FUNC(_mul, fp32, float, _MUL)
-
-#endif
-	void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
-	void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
-}

From ed774769b4787f22a7ee4df46d985925bbaca77d Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 9 May 2024 21:59:14 +0000
Subject: [PATCH 147/234] Sync with upstream

---
 tests/test_functional.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 99cbf6f75..21dae281a 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -11,8 +11,15 @@
 
 import bitsandbytes as bnb
 from bitsandbytes import functional as F
+from tests.helpers import (
+    BOOLEAN_TUPLES,
+    TRUE_FALSE,
+    describe_dtype,
+    get_blocksizes,
+    get_test_dims,
+    id_formatter,
+)
 from bitsandbytes.cextension import BNB_HIP_VERSION, HIP_ENVIRONMENT, ROCM_GPU_ARCH
-from tests.helpers import BOOLEAN_TUPLES, TRUE_FALSE, describe_dtype, get_blocksizes, get_test_dims, id_formatter
 
 torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
 k = 20

From 943c57a26c4644cca92da6f9ec256dd7280f381b Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Thu, 9 May 2024 23:56:04 +0000
Subject: [PATCH 148/234] Sync files with upstream

---
 include/Algo-Direct2.h | 16 ++++++++--------
 include/SIMD.h         |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h
index 91dded6f4..547ca9955 100644
--- a/include/Algo-Direct2.h
+++ b/include/Algo-Direct2.h
@@ -94,8 +94,8 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
         __m128 vxp = _mm_shuffle_ps(xp01, xp23, (1) + (3 << 2) + (1 << 4) + (3 << 6));
 #endif
         IVec<SSE, float> i(u.vec);
-        IVec<SSE, float> vlem = operator< (vz, vxm);
-        IVec<SSE, float> vlep = operator< (vz, vxp);
+        IVec<SSE, float> vlem = vz < vxm;
+        IVec<SSE, float> vlep = vz < vxp;
         i = i + vlem + vlep;
         i.store(pr);
     }
@@ -124,8 +124,8 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
         __m128d vxp = _mm_shuffle_pd(vx0, vx1, 3);
 
         IVec<SSE, double> i(b1, b0);
-        IVec<SSE, double> vlem = operator< (vz, vxm);
-        IVec<SSE, double> vlep = operator< (vz, vxp);
+        IVec<SSE, double> vlem = (vz < vxm);
+        IVec<SSE, double> vlep = (vz < vxp);
         i = i + vlem + vlep;
 
         union {
@@ -229,8 +229,8 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
 
 #endif
 
-        IVec<SSE, float> vlem = operator< (vz, vxm);
-        IVec<SSE, float> vlep = operator< (vz, vxp);
+        IVec<AVX, float> vlem = vz < vxm;
+        IVec<AVX, float> vlep = vz < vxp;
         ip = ip + vlem + vlep;
 
         ip.store(pr);
@@ -279,8 +279,8 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
 //        FVec<AVX, double> vxp = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01p), h23p, 1);
 
         IVec<AVX, double> i(u.vec);
-        IVec<SSE, float> vlem = operator< (vz, vxm);
-        IVec<SSE, float> vlep = operator< (vz, vxp);
+        IVec<AVX, double> vlem = vz < vxm;
+        IVec<AVX, double> vlep = vz < vxp;
         i = i + vlem + vlep;
         i.extractLo32s().store(pr);
     }
diff --git a/include/SIMD.h b/include/SIMD.h
index e97f5fc33..9d1410c73 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -307,7 +307,7 @@ FORCE_INLINE FVec<SSE,float> operator-   (const FVec<SSE,float>& a,  const FVec<
 FORCE_INLINE FVec<SSE,float> operator*   (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_mul_ps( a, b ); }
 FORCE_INLINE FVec<SSE,float> operator/   (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_div_ps( a, b ); }
 FORCE_INLINE IVec<SSE,float> ftoi        (const FVec<SSE,float>& a)                             { return _mm_cvttps_epi32(a); }
-#if !defined(__clang__) || defined(__HIP_PLATFORM_AMD__) // Conflicts with builtin operator
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,float> operator<=  (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator>=  (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator<   (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_castps_si128(_mm_cmplt_ps(a, b)); }
@@ -363,7 +363,7 @@ FORCE_INLINE FVec<SSE,double> operator-   (const FVec<SSE,double>& a, const FVec
 FORCE_INLINE FVec<SSE,double> operator*   (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_mul_pd( a, b ); }
 FORCE_INLINE FVec<SSE,double> operator/   (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_div_pd( a, b ); }
 FORCE_INLINE IVec<SSE,float>  ftoi        (const FVec<SSE,double>& a)                               { return _mm_cvttpd_epi32(a); }
-#if !defined(__clang__) || defined(__HIP_PLATFORM_AMD__) // Conflicts with builtin operator
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,double> operator<=  (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); }
 FORCE_INLINE IVec<SSE,double> operator<   (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); }
 FORCE_INLINE IVec<SSE,double> operator>=  (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); }

From 71d17023fadf0e9290577a519901c279bc93eccc Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 10 May 2024 00:37:27 +0000
Subject: [PATCH 149/234] Fix lint errors

---
 tests/test_functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 21dae281a..8ddee9f9a 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -11,6 +11,7 @@
 
 import bitsandbytes as bnb
 from bitsandbytes import functional as F
+from bitsandbytes.cextension import BNB_HIP_VERSION, HIP_ENVIRONMENT, ROCM_GPU_ARCH
 from tests.helpers import (
     BOOLEAN_TUPLES,
     TRUE_FALSE,
@@ -19,7 +20,6 @@
     get_test_dims,
     id_formatter,
 )
-from bitsandbytes.cextension import BNB_HIP_VERSION, HIP_ENVIRONMENT, ROCM_GPU_ARCH
 
 torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
 k = 20

From 6886bc8f4a28a580f49b814e45728765415f2ea1 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Wed, 8 May 2024 17:25:53 +0000
Subject: [PATCH 150/234] Exclude hip files from typo checks

---
 .pre-commit-config.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a859d05af..9babbc0cc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,3 +21,4 @@ repos:
     rev: v1.18.2
     hooks:
       - id: typos
+        exclude: ^.*\.hip$

From 0d445f4fba98d0c6565817e7013ef9f910c2164a Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 10 May 2024 00:47:42 +0000
Subject: [PATCH 151/234] update ops.hip

---
 csrc/ops.hip | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/ops.hip b/csrc/ops.hip
index 67cece5c1..157e84629 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -64,7 +64,7 @@ template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(floa
   num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
 
   if(blocksize == 4096)
-   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 4096, 4, STOCHASTIC, 0>), dim3(num_blocks), dim3(1024), 0, 0, code, A, absmax, out, rand, rand_offset, n);
+   hipLaunchKernelGGL(( kQuantizeBlockwise<T, 4096, 4, STOCHASTIC, DATA_TYPE>), dim3(num_blocks), dim3(1024), 0, 0, code, A, absmax, out, rand, rand_offset, n);
   else if(blocksize == 2048)
    hipLaunchKernelGGL(( kQuantizeBlockwise<T, 2048, 4, 0, DATA_TYPE>), dim3(num_blocks), dim3(512), 0, 0, code, A, absmax, out, rand, rand_offset, n);
   else if(blocksize == 1024)

From 177bd398b3235f586e9e2110b6ffe8288eef4f00 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Fri, 10 May 2024 00:22:04 -0700
Subject: [PATCH 152/234] Minor improvements

---
 bitsandbytes/backends/cpu.py            | 1 +
 bitsandbytes/backends/cpu_xpu_common.py | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index a5e123e62..80b6c241e 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -136,6 +136,7 @@ def quantize_4bit(
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
         assert_on_cpu([A, absmax, out])
+        assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
         return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
 
     def dequantize_4bit(
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 078b81680..ab881c6dd 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -343,6 +343,8 @@ def quantize_4bit_impl(
         )
 
     if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and input_shape[0] % blocksize == 0:
+        # lowp_mode: lowest precision for computation
+        lowp_mode = ipex_cpu.quantization.WoqLowpMode.BF16
         state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
             out.reshape([input_shape[0], input_shape[1] // 2]),
             ipex_cpu.quantization.WoqWeightDtype.NF4,
@@ -353,8 +355,8 @@ def quantize_4bit_impl(
             None, # g_idx
             None, # batch_size
             blocksize,
-            int(ipex_cpu.quantization.WoqLowpMode.BF16),
-            -1, # act_quant_mode
+            int(lowp_mode),
+            -1, # act_quant_mode. -1 means don't quant activation
         )
 
     return out, state

From 15c7f77913f72004f0e4e1a147f9d78b596386ff Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 10 May 2024 15:24:37 +0000
Subject: [PATCH 153/234] Add install steps for ROCm

---
 docs/source/rocm_installation.mdx | 46 +++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 docs/source/rocm_installation.mdx

diff --git a/docs/source/rocm_installation.mdx b/docs/source/rocm_installation.mdx
new file mode 100644
index 000000000..476cbae07
--- /dev/null
+++ b/docs/source/rocm_installation.mdx
@@ -0,0 +1,46 @@
+# ROCm Installation
+
+Please follow these steps to install bitsandbytes on ROCm. 
+
+<hfoptions id="rocm_install">
+<hfoption id="latest">
+
+For latest installation:
+
+```bash
+git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+pip install -r requirements-dev.txt
+cmake -DCOMPUTE_BACKEND=hip -S . #Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch
+make
+pip install .
+```
+
+<hfoption>
+<hfoption id="version specific">
+
+For ROCm specific versions:
+
+Install Dependencies:
+
+```bash
+# hipblaslt installation needed only for rocm<6.0
+apt install hipblaslt
+pip install --upgrade pip
+pip install einops lion_pytorch accelerate
+pip install git+https://github.com/ROCm/transformers.git
+```
+
+Install bitsandbytes from [ROCm](https://github.com/ROCm/bitsandbytes) repo:
+
+```bash
+git clone --recurse https://github.com/ROCm/bitsandbytes
+cd bitsandbytes
+# Checkout branch as needed
+# for rocm 5.7 - rocm5.7_internal_testing
+# for rocm 6.x - rocm6.2_internal_testing
+git checkout <branch>
+make hip
+python setup.py install
+```
+<hfoption>
+</hfoptions>

From d62c83589f6d318369a9d3a3c626eb5221ba079b Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Fri, 10 May 2024 15:28:56 +0000
Subject: [PATCH 154/234] Fix lint error

---
 docs/source/rocm_installation.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/rocm_installation.mdx b/docs/source/rocm_installation.mdx
index 476cbae07..5d4381e7d 100644
--- a/docs/source/rocm_installation.mdx
+++ b/docs/source/rocm_installation.mdx
@@ -1,6 +1,6 @@
 # ROCm Installation
 
-Please follow these steps to install bitsandbytes on ROCm. 
+Please follow these steps to install bitsandbytes on ROCm.
 
 <hfoptions id="rocm_install">
 <hfoption id="latest">

From 881b5fcd0bc77f747850f397a0bf02c288332c17 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Fri, 10 May 2024 22:34:32 -0700
Subject: [PATCH 155/234] Add fp4 support; add UT; fix lint issues

---
 bitsandbytes/backends/cpu.py            |   4 +-
 bitsandbytes/backends/cpu_xpu_common.py | 109 ++++++++++++++----------
 tests/test_functional.py                |  50 ++++++++++-
 3 files changed, 114 insertions(+), 49 deletions(-)

diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 80b6c241e..2c3688251 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -6,12 +6,12 @@
 
 from .base import Backend
 from .cpu_xpu_common import (
+    dequantize_4bit_impl,
     double_quant_impl,
+    gemm_4bit_impl,
     igemmlt_impl,
     mm_dequant_impl,
     quantize_4bit_impl,
-    dequantize_4bit_impl,
-    gemm_4bit_impl,
 )
 
 Tensor = torch.Tensor
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index ab881c6dd..8d87f7e2f 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -1,11 +1,11 @@
+from typing import Optional
 import warnings
+
 import torch
-from typing import Optional
+
 from bitsandbytes.functional import (
-    get_4bit_type,
-    quantize_blockwise,
-    dequantize_blockwise,
     QuantState,
+    get_4bit_type,
 )
 
 try:
@@ -237,25 +237,37 @@ def mm_dequant_impl(
 
 
 NF4_QUANT_TABLE = [
-    -1.0 - 1e-2,           # 0b0000
-    -0.8480964004993439,   # 0b0001
-    -0.6106329262256622,   # 0b0010
-    -0.4599952697753906,   # 0b0011
+    -1.0 - 1e-2,  # 0b0000
+    -0.8480964004993439,  # 0b0001
+    -0.6106329262256622,  # 0b0010
+    -0.4599952697753906,  # 0b0011
     -0.33967943489551544,  # 0b0100
     -0.23460740596055984,  # 0b0101
     -0.13791173323988914,  # 0b0110
-    -0.045525018125772476, # 0b0111
-    0.03979014977812767,   # 0b1000
-    0.1202552504837513,    # 0b1001
-    0.2035212516784668,    # 0b1010
-    0.2920137718319893,    # 0b1011
-    0.3893125355243683,    # 0b1100
-    0.5016634166240692,    # 0b1101
-    0.6427869200706482,    # 0b1110
-    0.8614784181118011,    # 0b1111
+    -0.045525018125772476,  # 0b0111
+    0.03979014977812767,  # 0b1000
+    0.1202552504837513,  # 0b1001
+    0.2035212516784668,  # 0b1010
+    0.2920137718319893,  # 0b1011
+    0.3893125355243683,  # 0b1100
+    0.5016634166240692,  # 0b1101
+    0.6427869200706482,  # 0b1110
+    0.8614784181118011,  # 0b1111
 ]
 
 
+FP4_QUANT_TABLE = {
+    0 - 1e-2: 0,  # 0b0000
+    0.00260417: 1,  # 0b0001
+    0.0859375: 6,  # 0b0110
+    0.20833333: 7,  # 0b0111
+    0.29166667: 4,  # 0b0100
+    0.4166667: 5,  # 0b0101
+    0.583333: 2,  # 0b0010
+    0.8333333: 3,  # 0b0011
+}
+
+
 # It's faster not to use torch.compile
 def quantize_4bit_impl(
     A: Tensor,
@@ -290,10 +302,11 @@ def quantize_4bit_impl(
     tuple(torch.Tensor, torch.Size, torch.dtype, int):
         The quantization state to undo the quantization.
     """
-    if quant_type != "nf4":
-        raise NotImplementedError(
-            f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU."
-        )
+    if quant_type not in ["nf4", "fp4"]:
+        raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU.")
+    if quant_type == "fp4":
+        warnings.warn("fp4 quantization is currently slow on CPU/XPU. Please Use nf4 instead for better performance.")
+    assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
     n = A.numel()
     input_shape = A.shape
     blocks = n // blocksize
@@ -305,25 +318,31 @@ def quantize_4bit_impl(
     if out is None:
         out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device)
 
-    assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
     rem = n % blocksize
     has_rem = rem > 0
 
     # Scale tensor to [-1, 1]
     A_reshaped = A.reshape(n)
-    A_com = A_reshaped[:n - rem]
+    A_com = A_reshaped[: n - rem]
     A_com_reshaped = A_com.reshape(n // blocksize, blocksize)
-    absmax[:blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0]
-    scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[:blocks - has_rem].view(-1, 1)), -1, 1)
+    absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0]
+    scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[: blocks - has_rem].view(-1, 1)), -1, 1)
     scaled_A = scaled_A.reshape(-1)
     if has_rem:
-        absmax[-1] = torch.abs(A_reshaped[n - rem:]).max()
-        scaled_A_rem = torch.clamp(A_reshaped[n - rem:] * (1 / absmax[-1]), -1, 1)
+        absmax[-1] = torch.abs(A_reshaped[n - rem :]).max()
+        scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1)
         scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
-    # map [-1, 1] to nf4
+    # map [-1, 1] to nf4/fp4
     out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8)
-    for i in range(len(NF4_QUANT_TABLE)):
-        out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i
+    if quant_type == "nf4":
+        for i in range(len(NF4_QUANT_TABLE)):
+            out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i
+    elif quant_type == "fp4":
+        sign = scaled_A < 0
+        abs_scaled_A = torch.abs(scaled_A)
+        for key, val in FP4_QUANT_TABLE.items():
+            out_uint8[abs_scaled_A > key] = val
+        out_uint8 += sign.to(torch.uint8) * 8
     if out_uint8.size(-1) % 2:
         out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0)
     out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2])
@@ -342,21 +361,21 @@ def quantize_4bit_impl(
             quant_type=quant_type,
         )
 
-    if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and input_shape[0] % blocksize == 0:
+    if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and input_shape[1] % blocksize == 0 and quant_type == "nf4":
         # lowp_mode: lowest precision for computation
         lowp_mode = ipex_cpu.quantization.WoqLowpMode.BF16
         state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
             out.reshape([input_shape[0], input_shape[1] // 2]),
             ipex_cpu.quantization.WoqWeightDtype.NF4,
-            input_shape, # weight shape
-            absmax.view(input_shape[0], input_shape[1] // blocksize), # scales
-            None, # zero_points
-            None, # bias
-            None, # g_idx
-            None, # batch_size
+            input_shape,  # weight shape
+            absmax.view(input_shape[0], input_shape[1] // blocksize),  # scales
+            None,  # zero_points
+            None,  # bias
+            None,  # g_idx
+            None,  # batch_size
             blocksize,
             int(lowp_mode),
-            -1, # act_quant_mode. -1 means don't quant activation
+            -1,  # act_quant_mode. -1 means don't quant activation
         )
 
     return out, state
@@ -365,7 +384,7 @@ def quantize_4bit_impl(
 @_maybe_torch_compile
 def dequantize_4bit_impl(
     A: Tensor,
-    quant_state = None,
+    quant_state=None,
     absmax: Tensor = None,
     out: Tensor = None,
     blocksize: int = 64,
@@ -412,7 +431,7 @@ def dequantize_4bit_impl(
     else:
         absmax = quant_state.absmax
 
-    if quant_state.quant_type != "nf4":
+    if quant_type not in ["nf4", "fp4"]:
         raise NotImplementedError(
             f"4-bit quantization data type {quant_state.quant_type} is not implemented for CPU/XPU."
         )
@@ -421,9 +440,7 @@ def dequantize_4bit_impl(
         raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
 
     if out is None:
-        out = torch.empty(
-            quant_state.shape, dtype=quant_state.dtype, device=A.device
-        )
+        out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
 
     n = out.numel()
     # Map nf4 to [-1, 1]
@@ -443,9 +460,11 @@ def dequantize_4bit_impl(
     rem = n % blocksize
     has_rem = rem > 0
     out_reshaped = out.reshape(-1)
-    out_reshaped[:n - rem] = (out_dq[:n - rem].view(-1, blocksize) * absmax[:blocks - has_rem].view(-1, 1)).reshape(-1)
+    out_reshaped[: n - rem] = (out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)).reshape(
+        -1
+    )
     if has_rem:
-        out_reshaped[n - rem:] = out_dq[n - rem:] * absmax[-1]
+        out_reshaped[n - rem :] = out_dq[n - rem :] * absmax[-1]
 
     # take transpose here because weight is transposed (again) for computation
     return out.t()
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 8e125f712..ea15f148a 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2003,7 +2003,8 @@ def test_bench_dequantization():
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
 @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
 @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
-def test_4bit_quant(dtype, quant_type, blocksize):
+@pytest.mark.parametrize("device", ["cuda", "cpu"])
+def test_4bit_quant(dtype, quant_type, blocksize, device):
     vals = list(product([0, 1], repeat=4))
 
     code = {}
@@ -2027,9 +2028,11 @@ def test_4bit_quant(dtype, quant_type, blocksize):
             result = sign * exp * frac
         code[idx] = result
 
-    A1 = torch.randn(1024, 1024, device="cuda", dtype=dtype)
+    A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
     qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
     A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
+    if device == "cpu":
+        A2 = A2.t()
 
     err = (A1 - A2).abs().float()
     relerr = (err / (A1.abs().float() + 1e-8)).mean()
@@ -2279,6 +2282,49 @@ def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind):
             assert maxratio < 1.02 and maxratio > 0.98
 
 
+@pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
+@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
+def test_gemv_4bit_cpu(dtype, quant_type, kind):
+    """
+    Test 4bit GEMV for CPU. It is simplified a lot from the cuda version, since
+    the CPU backend does not support double_quant or quant_storage other than uint8.
+    Also, the CPU backend has different numeric accuracy from that of CUDA
+    """
+    for dim in [128, 256, 512, 1024]:
+        for i in range(10):
+            if kind == "fc1":
+                A = torch.randn(1, dim, dtype=dtype, device="cpu")
+                B = torch.randn(dim * 4, dim, dtype=dtype, device="cpu") / math.sqrt(dim)
+            elif kind == "fc2":
+                A = torch.randn(1, 4 * dim, dtype=dtype, device="cpu")
+                B = torch.randn(dim, 4 * dim, dtype=dtype, device="cpu") / math.sqrt(dim)
+            elif kind == "attn":
+                A = torch.randn(1, dim, dtype=dtype, device="cpu")
+                B = torch.randn(dim, dim, dtype=dtype, device="cpu") / math.sqrt(dim)
+            elif kind == "attn_packed":
+                A = torch.randn(1, dim, dtype=dtype, device="cpu")
+                B = torch.randn(dim * 3, dim, dtype=dtype, device="cpu") / math.sqrt(dim)
+
+            qB, state = F.quantize_4bit(
+                B,
+                quant_type=quant_type,
+                compress_statistics=False,
+                quant_storage=torch.uint8,
+            )
+            dqB = F.dequantize_4bit(qB, state)
+            C3 = torch.matmul(A, dqB)
+            C2 = F.gemv_4bit(A, qB.t(), state=state)
+            A.requires_grad = True
+            C1 = bnb.matmul_4bit(A, qB.t(), state)
+
+            c = int(C1.numel() * 0.0014 * (dim / 256)) + 1
+            rtol = 1e-3 if dtype != torch.bfloat16 else 1e-2
+            atol = 1e-2 if dtype != torch.bfloat16 else 5e-2
+            assert_all_approx_close(C1, C2, rtol, atol, count=c)
+            assert_all_approx_close(C3, C2, rtol, atol, count=c)
+
+
 @pytest.mark.skip("Row scale has some bugs for ampere")
 def test_managed():
     n = 32 * 10

From dd15734709f131b4c1e3244ba28e632dbf5a3ed6 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Fri, 10 May 2024 23:57:25 -0700
Subject: [PATCH 156/234] Reduce memory usage

---
 bitsandbytes/backends/cpu_xpu_common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 8d87f7e2f..426d07975 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -377,6 +377,7 @@ def quantize_4bit_impl(
             int(lowp_mode),
             -1,  # act_quant_mode. -1 means don't quant activation
         )
+        return torch.Tensor(), state
 
     return out, state
 

From 85a01b00fc131a586dec8fec5d25d753a471006c Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Sat, 11 May 2024 00:42:31 -0700
Subject: [PATCH 157/234] Fix UT

---
 bitsandbytes/backends/cpu_xpu_common.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 426d07975..7c35a85c3 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -440,6 +440,11 @@ def dequantize_4bit_impl(
     if quant_state.nested:
         raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
 
+    if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(quant_state, "op_context"):
+        assert quant_state.op_context is not None
+        A = quant_state.op_context.to_public(quant_state.op_context.get_weight())
+        A = A.reshape(-1)
+
     if out is None:
         out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
 
@@ -503,7 +508,7 @@ def gemm_4bit_impl(
     torch.Tensor:
         GEMM output tensor.
     """
-    if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and hasattr(state, "op_context"):
+    if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(state, "op_context"):
         assert state.op_context is not None
         output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle())
     else:

From 2c489f8dde8e5992af5aa0956e1a4cb9554b72eb Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Sat, 11 May 2024 00:54:17 -0700
Subject: [PATCH 158/234] reduce memory usage for nf4

---
 bitsandbytes/backends/cpu_xpu_common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 7c35a85c3..138ec72f5 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -377,6 +377,7 @@ def quantize_4bit_impl(
             int(lowp_mode),
             -1,  # act_quant_mode. -1 means don't quant activation
         )
+        state.absmax = torch.Tensor()
         return torch.Tensor(), state
 
     return out, state
@@ -444,6 +445,7 @@ def dequantize_4bit_impl(
         assert quant_state.op_context is not None
         A = quant_state.op_context.to_public(quant_state.op_context.get_weight())
         A = A.reshape(-1)
+        absmax = quant_state.op_context.get_scales().reshape(-1)
 
     if out is None:
         out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)

From 410f4998907cd7b4b7978b4a19c98afea66a80da Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.nunna@amd.com>
Date: Wed, 15 May 2024 19:28:06 +0000
Subject: [PATCH 159/234] Add comments for HIP changes

---
 bitsandbytes/backends/cuda.py | 14 ++++++++++++++
 bitsandbytes/functional.py    | 14 ++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index 57f9e953f..ad478431c 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -186,6 +186,8 @@ def transform(
         ld=None,
     ):
         if HIP_ENVIRONMENT:
+            # transform kernel formats (col32/col_turing/col_ampere) are not applicable to ROCm
+            # Use nvidia_transform instead
             return nvidia_transform(A, to_order, from_order, out, transpose, state, ld)
 
         prev_device = pre_call(A.device)
@@ -271,11 +273,13 @@ def igemmlt(
 
         if dimsA == 2 and out is None:
             if HIP_ENVIRONMENT:
+                # Use col format for HIP
                 out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col", "row")
             else:
                 out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row")
         elif dimsA == 3 and out is None:
             if HIP_ENVIRONMENT:
+                # Use col format for HIP
                 out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col", "row")
             else:
                 out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row")
@@ -287,6 +291,7 @@ def igemmlt(
         assert B.dtype == torch.int8
         assert out.dtype == dtype
         if HIP_ENVIRONMENT:
+            # Use col format for HIP
             assert SA[1] == "col"
             assert SB[1] == "col"
             assert Sout[1] == "col"
@@ -309,6 +314,7 @@ def igemmlt(
 
         k = shapeA[-1]
         if HIP_ENVIRONMENT:
+            # Set ld values for col format
             lda = ct.c_int32(m)
             ldb = ct.c_int32(shapeB[0])
             ldc = ct.c_int32(m)
@@ -369,6 +375,7 @@ def mm_dequant(
         bias: Optional[torch.Tensor] = None,
     ):
         if HIP_ENVIRONMENT:
+            # HIP kernel requires 'row' format
             A, quant_state = nvidia_transform(A, "row", state=quant_state)
         assert A.dtype == torch.int32
         if bias is not None:
@@ -411,6 +418,7 @@ def extract_outliers(self, A: torch.Tensor, SA: Tuple[torch.Size, str], idx: tor
         if not HIP_ENVIRONMENT:
             assert formatA in ["col_turing", "col_ampere"]
         else:
+            # HIP uses col format
             assert formatA in ["col"]
         assert A.device.type == "cuda"
 
@@ -445,6 +453,8 @@ def quantize_4bit(
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
         if blocksize is None:
+            # Some AMD GPUs have warpsize 64
+            # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
             blocksize = 64 if not HIP_ENVIRONMENT else 128
         if A.device.type != "cuda":
             raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}")
@@ -463,6 +473,8 @@ def quantize_4bit(
             mod = dtype2bytes[quant_storage] * 2
             out = torch.zeros(((n + 1) // mod, 1), dtype=quant_storage, device=A.device)
 
+        # Some AMD GPUs have warpsize 64
+        # Set min blocksize to 128 (~warpsize 64 in kernel) for HIP
         if not HIP_ENVIRONMENT:
             assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
         else:
@@ -540,6 +552,8 @@ def dequantize_4bit(
         blocksize: Optional[int] = None,
         quant_type: Literal["fp4", "nf4"] = "fp4",
     ) -> torch.Tensor:
+        # Some AMD GPUs have warpsize 64
+        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
         if blocksize is None:
             blocksize = 64 if not HIP_ENVIRONMENT else 128
         supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index c807ba17a..2041589b3 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -479,6 +479,8 @@ def nvidia_transform(
     ld=None,
 ):
     if HIP_ENVIRONMENT:
+        # col32/col_turing/col_ampere are not applicable to ROCm
+        # Use col format instead
         to_order = "col" if to_order in ["col32", "col_turing", "col_ampere"] else to_order
         from_order = "col" if from_order in ["col32", "col_turing", "col_ampere"] else from_order
 
@@ -630,6 +632,8 @@ def quantize_blockwise(
         out = torch.zeros_like(A, dtype=torch.uint8)
 
     if A.device.type != "cpu":
+        # Some AMD GPUs have warpsize 64
+        # Set min blocksize to 128 (~warpsize 64 in kernel) for HIP
         if not HIP_ENVIRONMENT:
             assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
         else:
@@ -755,6 +759,8 @@ def dequantize_blockwise(
         device = pre_call(A.device)
         code = quant_state.code.to(A.device)
         supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
+        # Some AMD GPUs have warpsize 64
+        # Set min blocksize to 128 (~warpsize 64 in kernel) for HIP
         if HIP_ENVIRONMENT:
             supported_blocksizes = supported_blocksizes[:-1]
         if quant_state.blocksize not in supported_blocksizes:
@@ -897,6 +903,8 @@ def quantize_fp4(
     quant_storage=torch.uint8,
 ):
     if blocksize is None:
+        # Some AMD GPUs have warpsize 64
+        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
         blocksize = 64 if not HIP_ENVIRONMENT else 128
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4", quant_storage)
 
@@ -910,6 +918,8 @@ def quantize_nf4(
     quant_storage=torch.uint8,
 ):
     if blocksize is None:
+        # Some AMD GPUs have warpsize 64
+        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
         blocksize = 64 if not HIP_ENVIRONMENT else 128
     return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4", quant_storage)
 
@@ -968,6 +978,8 @@ def dequantize_fp4(
     blocksize: Optional[int] = None,
 ) -> Tensor:
     if blocksize is None:
+        # Some AMD GPUs have warpsize 64
+        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
         blocksize = 64 if not HIP_ENVIRONMENT else 128
 
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
@@ -981,6 +993,8 @@ def dequantize_nf4(
     blocksize: Optional[int] = None,
 ) -> Tensor:
     if blocksize is None:
+        # Some AMD GPUs have warpsize 64
+        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
         blocksize = 64 if not HIP_ENVIRONMENT else 128
 
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")

From ccee5d894b8aed4c24976a5622140a930d1f5574 Mon Sep 17 00:00:00 2001
From: statelesshz <hzji210@gmail.com>
Date: Mon, 27 May 2024 14:22:54 +0800
Subject: [PATCH 160/234] Add empty stubs for Ascend NPU

---
 bitsandbytes/__init__.py     |   6 +-
 bitsandbytes/backends/npu.py | 170 +++++++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+), 1 deletion(-)
 create mode 100644 bitsandbytes/backends/npu.py

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 760a8eda4..eff7fc686 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -16,6 +16,7 @@
 )
 from .backends import register_backend
 from .backends.cpu import CPUBackend
+from .backends.npu import NPUBackend
 from .cextension import lib
 from .nn import modules
 
@@ -49,11 +50,14 @@
 
     register_backend("xpu", XPUBackend())
 
+# Register Ascend NPU backend, if available.
+if hasattr(torch, "npu") and torch.npu.is_available():
+    register_backend("npu", NPUBackend())
+
 # TODO: Other potential backends:
 # XLA - Google TPU / PJRT runtime
 # HPU - Habana / Intel Gaudi
 # IPU - Graphcore
-# NPU - Ascend
 # Note that we may not map 1:1 with a device type, e.g. SYCL, XLA
 # In this case, it will be up to each backend to dispatch as needed
 
diff --git a/bitsandbytes/backends/npu.py b/bitsandbytes/backends/npu.py
new file mode 100644
index 000000000..1b3cb57d6
--- /dev/null
+++ b/bitsandbytes/backends/npu.py
@@ -0,0 +1,170 @@
+from typing import Literal, Optional, Tuple, Union
+
+import torch
+
+from bitsandbytes.utils import QuantState
+
+from .base import Backend
+
+try:
+    # to support Ascend NPU backend
+    import torch_npu  # noqa: F401
+except ImportError:
+    pass
+
+
+class NPUBackend(Backend):
+    def double_quant(
+        self,
+        A: torch.Tensor,
+        col_stats: Optional[torch.Tensor] = None,
+        row_stats: Optional[torch.Tensor] = None,
+        out_col: Optional[torch.Tensor] = None,
+        out_row: Optional[torch.Tensor] = None,
+        threshold=0.0,
+    ):
+        raise NotImplementedError
+
+    def transform(
+        self,
+        A: torch.Tensor,
+        to_order: str,
+        from_order="row",
+        out: Optional[torch.Tensor] = None,
+        transpose=False,
+        state: Optional[Tuple[torch.Size, str]] = None,
+        ld=None,
+    ):
+        raise NotImplementedError
+
+    def igemmlt(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        SB: Tuple[torch.Size, str],
+        out: Optional[torch.Tensor] = None,
+        Sout: Optional[Tuple[torch.Size, str]] = None,
+        dtype=torch.int32,
+    ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
+        raise NotImplementedError
+
+    def mm_dequant(
+        self,
+        A: torch.Tensor,
+        quant_state: Tuple[torch.Size, str],
+        row_stats: torch.Tensor,
+        col_stats: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        new_row_stats: Optional[torch.Tensor] = None,
+        new_col_stats: Optional[torch.Tensor] = None,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def extract_outliers(
+        self,
+        A: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        idx: torch.Tensor,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def quantize_4bit(
+        self,
+        A: torch.Tensor,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type: Literal["fp4", "nf4"] = "fp4",
+        quant_storage=torch.uint8,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    def dequantize_4bit(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 64,
+        quant_type: Literal["fp4", "nf4"] = "fp4",
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def gemv_4bit(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        transposed_A=False,
+        transposed_B=False,
+        state: QuantState = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def dequantize_blockwise(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        code: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 4096,
+        nested=False,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def quantize_blockwise(
+        self,
+        A: torch.Tensor,
+        code: Optional[torch.Tensor] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=4096,
+        nested=False,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError
+
+    def optimizer_update_8bit_blockwise(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        state2: Optional[torch.Tensor],
+        beta1: float,
+        beta2: float,
+        eps: float,
+        step: int,
+        lr: float,
+        qmap1: torch.Tensor,
+        qmap2: Optional[torch.Tensor],
+        absmax1: torch.Tensor,
+        absmax2: Optional[torch.Tensor],
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError
+
+    def optimizer_update_32bit(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        beta1: float,
+        eps: float,
+        step: int,
+        lr: float,
+        state2: Optional[torch.Tensor] = None,
+        beta2: float = 0.0,
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        unorm_vec: Optional[torch.Tensor] = None,
+        max_unorm: float = 0.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError

From 36fe1a0cc768686cb6e3d864573eae822509d21d Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 29 May 2024 13:58:37 -0400
Subject: [PATCH 161/234] fix blocksize

---
 bitsandbytes/backends/cpu.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 2c3688251..5d38171d5 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -135,6 +135,8 @@ def quantize_4bit(
         quant_type: Literal["fp4", "nf4"] = "fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
+        if blocksize is None:
+            blocksize = 64
         assert_on_cpu([A, absmax, out])
         assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
         return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
@@ -148,6 +150,8 @@ def dequantize_4bit(
         blocksize: int = 64,
         quant_type: Literal["fp4", "nf4"] = "fp4",
     ) -> torch.Tensor:
+        if blocksize is None:
+            blocksize = 64
         assert_on_cpu([A, absmax, out])
         return dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
 

From 517eaf2b5b789033dab1cd85459057129e6a0b19 Mon Sep 17 00:00:00 2001
From: Xia Weiwen <xia.weiwen@hotmail.com>
Date: Thu, 6 Jun 2024 22:09:13 +0800
Subject: [PATCH 162/234] CPU: add torch.compile for F.double_quant and
 F.quantize_4bit (#1238)

---
 bitsandbytes/backends/cpu_xpu_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 138ec72f5..396234853 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -55,7 +55,7 @@ def _maybe_torch_compile(func):
     return func
 
 
-# Don't use torch.compile for now due to PyTorch issue https://github.com/pytorch/pytorch/issues/124382
+@_maybe_torch_compile
 def double_quant_impl(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
     """
     Find absolute max values of each row/column of a tensor, and symmetrically quantize it to int8.
@@ -268,7 +268,7 @@ def mm_dequant_impl(
 }
 
 
-# It's faster not to use torch.compile
+@_maybe_torch_compile
 def quantize_4bit_impl(
     A: Tensor,
     absmax: Tensor = None,

From 193120d1677ff0c4c502fc81835251e4b29d0c48 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 21 Jun 2024 18:48:44 +0200
Subject: [PATCH 163/234] cleanup docs-build breaking install instructs (#1244)

* cleanup docs-build breaking install instructs

* Update install instructions for ROCm

* Update installation.mdx

---------

Co-authored-by: Prasanth Nunna <Prasanth.Nunna@amd.com>
Co-authored-by: pnunna93 <104791500+pnunna93@users.noreply.github.com>
---
 docs/source/installation.mdx      | 43 +++++++++++++++++++++++++++++
 docs/source/rocm_installation.mdx | 46 -------------------------------
 2 files changed, 43 insertions(+), 46 deletions(-)
 delete mode 100644 docs/source/rocm_installation.mdx

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index caf22488f..c07ef29f6 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -91,6 +91,49 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com
 </hfoption>
 </hfoptions>
 
+## Multi-backend preview release (+ compilation)
+
+Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
+
+<hfoptions id="backend">
+<hfoption id="AMD ROCm">
+
+For a ROCm specific install:
+
+bitsandbytes is fully supported from ROCm 6.1.
+
+**Note:** If you already installed ROCm and PyTorch, skip docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
+
+```bash
+# Create a docker container with latest pytorch. It comes with ROCm and pytorch preinstalled
+docker pull rocm/pytorch:latest
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest
+
+# Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
+git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+
+# Install dependencies
+pip install -r requirements-dev.txt
+
+# Compile & install
+cmake -DCOMPUTE_BACKEND=hip -S .  # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch
+make
+pip install .
+```
+
+</hfoption>
+<hfoption id="Intel CPU + GPU">
+
+WIP
+
+</hfoption>
+<hfoption id="Apple Silicon (MPS)">
+
+WIP
+
+</hfoption>
+</hfoptions>
+
 ## PyTorch CUDA versions
 
 Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary.
diff --git a/docs/source/rocm_installation.mdx b/docs/source/rocm_installation.mdx
deleted file mode 100644
index 5d4381e7d..000000000
--- a/docs/source/rocm_installation.mdx
+++ /dev/null
@@ -1,46 +0,0 @@
-# ROCm Installation
-
-Please follow these steps to install bitsandbytes on ROCm.
-
-<hfoptions id="rocm_install">
-<hfoption id="latest">
-
-For latest installation:
-
-```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
-pip install -r requirements-dev.txt
-cmake -DCOMPUTE_BACKEND=hip -S . #Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch
-make
-pip install .
-```
-
-<hfoption>
-<hfoption id="version specific">
-
-For ROCm specific versions:
-
-Install Dependencies:
-
-```bash
-# hipblaslt installation needed only for rocm<6.0
-apt install hipblaslt
-pip install --upgrade pip
-pip install einops lion_pytorch accelerate
-pip install git+https://github.com/ROCm/transformers.git
-```
-
-Install bitsandbytes from [ROCm](https://github.com/ROCm/bitsandbytes) repo:
-
-```bash
-git clone --recurse https://github.com/ROCm/bitsandbytes
-cd bitsandbytes
-# Checkout branch as needed
-# for rocm 5.7 - rocm5.7_internal_testing
-# for rocm 6.x - rocm6.2_internal_testing
-git checkout <branch>
-make hip
-python setup.py install
-```
-<hfoption>
-</hfoptions>

From c79b1e926b05e856775e4962253e2dbf67bed103 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 21 Jun 2024 19:02:47 +0200
Subject: [PATCH 164/234] provide temp flag for outside libs to detect
 multi-backend preview (#1243)

* provide temp flag for outside libs to detect multi-backend preview

* fix typo in comment

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
---
 bitsandbytes/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index eff7fc686..c3a2f2402 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -20,6 +20,13 @@
 from .cextension import lib
 from .nn import modules
 
+# NOTE: this is a temporary flag to allow outside libraries to employ conditional logic while the refactor is still in
+# alpha/beta: sth like `if getattr(bitsandbytes, "is_multi_backend_refactor_preview", False): do sth`
+# the getattr() call above would default to False and any string evaluates to True. This way we have temporary thing
+# that we can remove in Transformers with the next release after the official BNB multi-platform release; then
+# eventually making it the new default (e.g. just remove if statement and dedent in Transformers)
+is_multi_backend_refactor_preview = "TO BE REMOVED ONCE MERGED TO `main`"  # bool evals to True for str
+
 # Always register the CPU backend.
 register_backend("cpu", CPUBackend())
 

From 1bfecc81e9f3b9a67a3b9bb9e1ab57468b1b9497 Mon Sep 17 00:00:00 2001
From: Xia Weiwen <xia.weiwen@hotmail.com>
Date: Wed, 10 Jul 2024 15:26:35 +0800
Subject: [PATCH 165/234] CPU/XPU: disable torch.compile if g++ is not
 available (#1251)

* CPU/XPU: disable torch.compile if g++ is not available

* Fix lint issue
---
 bitsandbytes/backends/cpu_xpu_common.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 396234853..c936dce14 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -1,3 +1,4 @@
+import subprocess
 from typing import Optional
 import warnings
 
@@ -19,6 +20,14 @@
     ipex_xpu = None
 
 
+gxx_available = False
+try:
+    subprocess.run(["g++", "--version"])
+    gxx_available = True
+except BaseException:
+    warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.")
+
+
 Tensor = torch.Tensor
 
 
@@ -45,8 +54,8 @@ def _ipex_xpu_version_prereq(major, minor):
 
 
 def _maybe_torch_compile(func):
-    # torch.compile requires pytorch >= 2.0
-    if _torch_version_prereq(2, 0):
+    # torch.compile requires g++ and pytorch >= 2.0
+    if gxx_available and _torch_version_prereq(2, 0):
         options = {}
         # fx_graph_cache requires pytorch >= 2.2
         if _torch_version_prereq(2, 2):

From 08597844023a5c59e9b5d5dbeafbac4174fae5cc Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Fri, 12 Jul 2024 09:15:10 -0500
Subject: [PATCH 166/234] Create build job for ROCm (#1255)

* Add build job for rocm

* Add rocm build script
---
 .github/scripts/build-rocm.sh        | 19 +++++++++++++++++++
 .github/workflows/python-package.yml | 22 ++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 .github/scripts/build-rocm.sh

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
new file mode 100644
index 000000000..fc7515aa7
--- /dev/null
+++ b/.github/scripts/build-rocm.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+declare build_arch
+declare build_os
+
+set -xeuo pipefail
+if [ "${build_os:0:6}" == ubuntu ]; then
+	image=rocm/dev-ubuntu-22.04:6.1-complete
+	echo "Using image $image"
+	docker run --rm --platform "linux/$build_arch" -i \
+		-w /src -v "$PWD:/src" "$image" sh -c \
+		"apt-get update \
+      && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+      && cmake -DCOMPUTE_BACKEND=hip . \
+      && cmake --build ."
+fi
+
+#output_dir="output/${build_os}/${build_arch}"
+#mkdir -p "${output_dir}"
+#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 72e1b099a..78bc747c3 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -103,6 +103,28 @@ jobs:
           name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
           path: output/*
           retention-days: 7
+  build-shared-libs-rocm:
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        arch: [x86_64]
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Docker multiarch
+        if: startsWith(matrix.os, 'ubuntu')
+        uses: docker/setup-qemu-action@v2
+      - name: Clean up disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+      - name: Build C++
+        run: bash .github/scripts/build-rocm.sh
+        env:
+          build_os: ${{ matrix.os }}
+          build_arch: ${{ matrix.arch }}
   build-wheels:
     needs:
       - build-shared-libs

From 9b726798542e01c45a7a4a841e144311980b90d6 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 19:13:24 +0000
Subject: [PATCH 167/234] Changelog: add explanation r. QLoRA mem savings

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ad648df1..e446155b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 ### 0.43.2
 
+This release is quite significant as the QLoRA bug fix big implications for higher `seqlen` and batch sizes.
+
+For each sequence (i.e. batch size increase of one) we expect memory savings of:
+- 405B: 39GB for seqlen 1024, and 4888GB for 128k
+- 70B: 20.1GB for 1024 and 2516GB for 128k
+
+This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug.
+
 #### Improvements:
 
 - docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu)

From 81375f8e67e9433c778fce3011930159357271c8 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Sat, 27 Jul 2024 13:11:00 +0000
Subject: [PATCH 168/234] docs: add more details to Intel install

---
 docs/source/installation.mdx | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 5b2cfe1d3..2f8fe4db7 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -31,7 +31,7 @@ To install from PyPI.
 pip install bitsandbytes
 ```
 
-### Compile from source
+### Compile from source[[compile]]
 
 For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations.
 
@@ -174,7 +174,18 @@ pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise
 > [!TIP]
 > Intel CPU backend only supports building from source; for now, please follow the instructions below.
 
-Like CUDA, you can compile bitsandbytes from source for Linux and Windows systems. Installing from source allows for more build options with different CMake configurations.
+Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems.
+
+The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile).
+
+```
+git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+pip install intel_extension_for_pytorch
+pip install -r requirements-dev.txt
+cmake -DCOMPUTE_BACKEND=cpu -S .
+make
+pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise leave that out)
+```
 
 </hfoption>
 <hfoption id="Apple Silicon (MPS)">

From 24f7b652cec822849fba69c583b8e73d84446627 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Sat, 27 Jul 2024 14:08:30 +0000
Subject: [PATCH 169/234] docs: cleanup of compilation instructions

---
 docs/source/installation.mdx | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 2f8fe4db7..f917f2623 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -2,7 +2,7 @@
 
 ## CUDA
 
-bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. There's a multi-backend effort under way which is currently in alpha release, see further down in this document.
+bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's a multi-backend effort under way which is currently in alpha release, check [the respective section below in case you're interested to help us with early feedback](#multi-backend).
 
 The latest version of bitsandbytes builds on:
 
@@ -134,7 +134,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
 
 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
 
-## Multi-backend preview release (+ compilation)
+## Multi-backend preview release compilation[[multi-backend]]
 
 Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
 
@@ -143,11 +143,10 @@ Please follow these steps to install bitsandbytes with device-specific backend s
 
 ### AMD GPU
 
-For a ROCm specific install:
+bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
 
-bitsandbytes is fully supported from ROCm 6.1.
-
-**Note:** If you already installed ROCm and PyTorch, skip docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
+> [!TIP]
+> If you already installed ROCm and PyTorch, skip Docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
 
 ```bash
 # Create a docker container with latest pytorch. It comes with ROCm and pytorch preinstalled
@@ -161,6 +160,7 @@ git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bit
 pip install -r requirements-dev.txt
 
 # Compile & install
+apt-get install -y build-essential cmake  # install build tools dependencies, unless present
 cmake -DCOMPUTE_BACKEND=hip -S .  # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch
 make
 pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise leave that out)
@@ -179,7 +179,7 @@ Similar to the CUDA case, you can compile bitsandbytes from source for Linux and
 The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile).
 
 ```
-git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
 pip install intel_extension_for_pytorch
 pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cpu -S .

From e3b27805346b7d55a5ca4ba91fb374415c11dc05 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Sat, 27 Jul 2024 14:16:49 +0000
Subject: [PATCH 170/234] docs: CHANGELOG.md fix

---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e446155b0..ed324f09e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,8 +3,8 @@
 This release is quite significant as the QLoRA bug fix big implications for higher `seqlen` and batch sizes.
 
 For each sequence (i.e. batch size increase of one) we expect memory savings of:
-- 405B: 39GB for seqlen 1024, and 4888GB for 128k
-- 70B: 20.1GB for 1024 and 2516GB for 128k
+- 405B: 39GB for `seqlen=1024`, and 4888GB for `seqlen=128,00`
+- 70B: 10.1GB for `seqlen=1024` and  1258GB for `seqlen=128,00`
 
 This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug.
 

From c8b4b33ef40d240b9650268dfe6ae15ac5472664 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Sat, 27 Jul 2024 23:28:30 +0800
Subject: [PATCH 171/234] fix dtype mismatch (#1285)

---
 bitsandbytes/backends/cpu_xpu_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index c936dce14..04755ed2d 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -524,7 +524,7 @@ def gemm_4bit_impl(
         output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle())
     else:
         dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize)
-        output = torch.matmul(A, dqB)
+        output = torch.matmul(A, dqB.to(A.dtype))
     if out is not None:
         out.copy_(output)
     else:

From d385aeaad85c7e5f688408d2822209f5a49a2738 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 30 Jul 2024 10:06:34 +0000
Subject: [PATCH 172/234] allow features flags on bnb

---
 bitsandbytes/__init__.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 129ac1536..e4b133476 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -20,12 +20,7 @@
 from .cextension import lib
 from .nn import modules
 
-# NOTE: this is a temporary flag to allow outside libraries to employ conditional logic while the refactor is still in
-# alpha/beta: sth like `if getattr(bitsandbytes, "is_multi_backend_refactor_preview", False): do sth`
-# the getattr() call above would default to False and any string evaluates to True. This way we have temporary thing
-# that we can remove in Transformers with the next release after the official BNB multi-platform release; then
-# eventually making it the new default (e.g. just remove if statement and dedent in Transformers)
-is_multi_backend_refactor_preview = "TO BE REMOVED ONCE MERGED TO `main`"  # bool evals to True for str
+features = {"multi_backend"}
 
 # Always register the CPU backend.
 register_backend("cpu", CPUBackend())

From 452749a6e3185bb4a9b10a86deeaa4d40c2c6d97 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Fri, 2 Aug 2024 06:01:37 +0800
Subject: [PATCH 173/234] Fix dequant 4bit (#1300)

* fix dequant 4bit weight

* fix 4bit dequant and matmul
---
 bitsandbytes/backends/cpu_xpu_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 04755ed2d..3c0574788 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -484,7 +484,7 @@ def dequantize_4bit_impl(
         out_reshaped[n - rem :] = out_dq[n - rem :] * absmax[-1]
 
     # take transpose here because weight is transposed (again) for computation
-    return out.t()
+    return out
 
 
 # Do not need torch.compile here as we are calling torch/ipex kernel
@@ -523,7 +523,7 @@ def gemm_4bit_impl(
         assert state.op_context is not None
         output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle())
     else:
-        dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize)
+        dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize).t()
         output = torch.matmul(A, dqB.to(A.dtype))
     if out is not None:
         out.copy_(output)

From a142f1ebfc3cef98e7b85c0c205309d9ca04fd54 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Fri, 2 Aug 2024 17:21:32 +0800
Subject: [PATCH 174/234] fix loading int8 model in CPU (#1303)

---
 bitsandbytes/nn/modules.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index c92b25e2c..85ce52cc5 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -639,8 +639,12 @@ def to(self, *args, **kwargs):
 
         if device.type == "cuda" and self.data.device.type == "cpu":
             return self.cuda(device)
-        elif device.type == "cpu" and self.data.dtype != torch.int8:
-            return self.cpu()
+        elif device.type == "cpu":
+            if self.data.dtype == torch.int8:
+                self.CB = self.data
+                return self
+            else:
+                return self.cpu()
         else:
             new_param = Int8Params(
                 super().to(device=device, dtype=dtype, non_blocking=non_blocking),

From 17750358f740ac768c10c767807698581f3257ad Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Fri, 2 Aug 2024 17:22:05 +0800
Subject: [PATCH 175/234] fix transpose 4bit (#1301)

---
 bitsandbytes/backends/cpu_xpu_common.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 3c0574788..e35535ddb 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -389,7 +389,7 @@ def quantize_4bit_impl(
         state.absmax = torch.Tensor()
         return torch.Tensor(), state
 
-    return out, state
+    return out.unsqueeze(0), state
 
 
 @_maybe_torch_compile
@@ -428,6 +428,13 @@ def dequantize_4bit_impl(
         Dequantized tensor.
     """
 
+    if A.shape[0] == 1:
+        transpose = False
+        A = A.squeeze(0)
+    elif A.shape[1] == 1:
+        transpose = True
+        A = A.squeeze(1)
+
     if quant_state is None:
         assert absmax is not None and out is not None
 
@@ -484,6 +491,9 @@ def dequantize_4bit_impl(
         out_reshaped[n - rem :] = out_dq[n - rem :] * absmax[-1]
 
     # take transpose here because weight is transposed (again) for computation
+    if transpose:
+        out = out.t()
+
     return out
 
 

From 6d9b69b626bf93a9ec22b068d1d4107f70979e34 Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Fri, 2 Aug 2024 07:43:55 -0500
Subject: [PATCH 176/234] Enable bitsandbytes packaging for ROCm (#1299)

* Add build job for rocm

* Add rocm build script

* Copy shared obj file into output_dir

* upload build artifacts and enable wheels build

* Remove cuda build temporarily

* Add ROCm version to .so filename

* Add rocm_version to whls build

* Revert "Remove cuda build temporarily"

This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738.

* Add rocm_version env var

* Remove thrush header files

* Print node info

* print cuda node info

* Revert "print cuda node info"

This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0.

* Revert "Print node info"

This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022.

* Add rocm arch to compile command

* Rename .so files to rocm

* Update default gpu arch

* Skip cpu based igemmlt int tests on ROCm

* Update Documentation

* Update upstream repo name

* Update docs

* Update string format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove pre-release option for torch install

* Update pytorch install path

Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
---
 .github/scripts/build-rocm.sh        | 12 +++++++-----
 .github/workflows/python-package.yml | 10 ++++++++++
 CMakeLists.txt                       |  6 ++++--
 bitsandbytes/cextension.py           |  6 ++++--
 csrc/kernels.hip                     |  2 --
 csrc/ops_hip.cuh                     |  6 ------
 docs/source/installation.mdx         | 20 +++++++++++++++-----
 tests/test_functional.py             |  3 +++
 8 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
index fc7515aa7..b508fac69 100644
--- a/.github/scripts/build-rocm.sh
+++ b/.github/scripts/build-rocm.sh
@@ -1,19 +1,21 @@
 #!/bin/bash
 declare build_arch
 declare build_os
+declare rocm_version
 
 set -xeuo pipefail
+bnb_rocm_arch="gfx90a;gfx942;gfx1100"
 if [ "${build_os:0:6}" == ubuntu ]; then
-	image=rocm/dev-ubuntu-22.04:6.1-complete
+	image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
 	echo "Using image $image"
 	docker run --rm --platform "linux/$build_arch" -i \
 		-w /src -v "$PWD:/src" "$image" sh -c \
 		"apt-get update \
       && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-      && cmake -DCOMPUTE_BACKEND=hip . \
+      && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
       && cmake --build ."
 fi
 
-#output_dir="output/${build_os}/${build_arch}"
-#mkdir -p "${output_dir}"
-#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
+output_dir="output/${build_os}/${build_arch}"
+mkdir -p "${output_dir}"
+(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 671dfee1c..91e6d82a6 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -106,6 +106,8 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         arch: [x86_64]
+        rocm_version:
+          ["6.1.2"]
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
       - uses: actions/checkout@v4
@@ -123,10 +125,18 @@ jobs:
         env:
           build_os: ${{ matrix.os }}
           build_arch: ${{ matrix.arch }}
+          rocm_version: ${{ matrix.rocm_version }}
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
+          path: output/*
+          retention-days: 7
   build-wheels:
     needs:
       - build-shared-libs
       - build-shared-libs-cuda
+      - build-shared-libs-rocm
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ec48b9d97..eac72fe52 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,7 +185,7 @@ elseif(BUILD_HIP)
       set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
     else()
       if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-        set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942")
+        set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100")
       elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
         set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
       endif()
@@ -194,12 +194,14 @@ elseif(BUILD_HIP)
 
     list(APPEND SRC_FILES ${HIP_FILES})
 
-    string(APPEND BNB_OUTPUT_NAME "_hip")
+    string(APPEND BNB_OUTPUT_NAME "_rocm")
 
     # get hip version
     execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION)
     string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
+    string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}")
 
+    string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}")
     if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1")
         string(APPEND BNB_OUTPUT_NAME "_nohipblaslt")
     endif()
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 03d2cbd61..cfeaf4f44 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path:
     """
     if torch.version.hip:
         if BNB_HIP_VERSION < 601:
-            return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
         else:
-            return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}"
     library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}"
     if not cuda_specs.has_cublaslt:
         # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
@@ -119,8 +119,10 @@ def get_native_library() -> BNBNativeLibrary:
     if torch.version.hip:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
         HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
+        BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
     else:
         HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
+        BNB_HIP_VERSION_SHORT = ""
     lib = get_native_library()
 except Exception as e:
     lib = None
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index ca77dceda..d8d7cdba5 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -10,8 +10,6 @@
 #include <hipcub/hipcub.hpp>
 #include <hip/hip_math_constants.h>
 
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
 //#include <mma.h>
 
 
diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh
index 1b9c13063..e57cbb3b5 100644
--- a/csrc/ops_hip.cuh
+++ b/csrc/ops_hip.cuh
@@ -21,12 +21,6 @@
 #include <vector>
 #include <functional>
 
-/*
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-*/
-
-
 #define CUDA_CHECK_RETURN(value) {                      \
   hipError_t _m_cudaStat = value;                    \
   if (_m_cudaStat != hipSuccess) {                   \
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index f917f2623..0e8da0cda 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -146,15 +146,25 @@ Please follow these steps to install bitsandbytes with device-specific backend s
 bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
 
 > [!TIP]
-> If you already installed ROCm and PyTorch, skip Docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
+> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version.
 
 ```bash
-# Create a docker container with latest pytorch. It comes with ROCm and pytorch preinstalled
-docker pull rocm/pytorch:latest
-docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest
+# Create a docker container with latest ROCm image, which includes ROCm libraries
+docker pull rocm/dev-ubuntu-22.04:6.1.2-complete
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.1.2-complete
+apt-get update && apt-get install -y git && cd home
 
+# Install pytorch compatible with above ROCm version
+pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
+
+# Install bitsandbytes from PyPI
+# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100
+# Please install from source if your configuration doesn't match with these)
+pip install bitsandbytes
+
+# Install bitsandbytes from source
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
-git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 
 # Install dependencies
 pip install -r requirements-dev.txt
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 4e82c530a..a9d926b89 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -584,6 +584,9 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
 @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb"))
 @pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device"))
 def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device):
+    if HIP_ENVIRONMENT and device == "cpu":
+        pytest.skip("this test is not supported on ROCm yet")
+
     for i in range(k):
         if dims == 2:
             A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8)

From bb438579d307f5758575d165cbac5edb77bf6432 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Wed, 14 Aug 2024 21:00:55 +0000
Subject: [PATCH 177/234] add bnb attribute to expose supported devices

---
 bitsandbytes/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index e4b133476..1e638eb79 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -21,6 +21,11 @@
 from .nn import modules
 
 features = {"multi_backend"}
+supported_torch_devices = {
+    "cuda",  # includes ROCm
+    "xpu",  # Intel GPU
+    "cpu",
+}
 
 # Always register the CPU backend.
 register_backend("cpu", CPUBackend())

From 18668d29af977ca0f616a5a16bdbce53604553a7 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Wed, 21 Aug 2024 02:29:16 +0800
Subject: [PATCH 178/234] fix 4bit dtype (#1325)

* fix 4bit dtype

* fix nf4 save
---
 bitsandbytes/backends/cpu_xpu_common.py | 2 +-
 bitsandbytes/nn/modules.py              | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index e35535ddb..0fcfffa07 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -387,7 +387,7 @@ def quantize_4bit_impl(
             -1,  # act_quant_mode. -1 means don't quant activation
         )
         state.absmax = torch.Tensor()
-        return torch.Tensor(), state
+        return torch.empty([1, 0], dtype=torch.uint8), state
 
     return out.unsqueeze(0), state
 
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 85ce52cc5..2348d0791 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -449,6 +449,10 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
         if getattr(self.weight, "quant_state", None) is not None:
             for k, v in self.weight.quant_state.as_dict(packed=True).items():
                 destination[prefix + "weight." + k] = v if keep_vars else v.detach()
+            if getattr(self.weight.quant_state, "op_context", None) is not None:
+                context = self.weight.quant_state.op_context
+                destination[prefix + "weight." + "absmax"] = context.get_scales().reshape(-1)
+                self.weight.data = context.to_public(context.get_weight()).reshape([1, -1])
 
     def forward(self, x: torch.Tensor):
         # weights are cast automatically as Int8Params, but the bias has to be cast manually

From 2bfa3472ecde8f3e4a0306b017826314c288b7c8 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 26 Aug 2024 19:27:12 +0000
Subject: [PATCH 179/234] docs: tweaks for multi-backend preview release prep

---
 docs/source/installation.mdx | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 0e8da0cda..60419b38a 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -134,14 +134,23 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
 
 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
 
-## Multi-backend preview release compilation[[multi-backend]]
+## Multi-backend[[multi-backend]]
+
+> [!TIP]
+> This functionality is currently in preview and therefore not yet production-ready!
 
 Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
 
+### Pip install the pre-built wheel (recommended for most)
+
+WIP (will be added in the coming days)
+
+### Compilation
+
 <hfoptions id="backend">
 <hfoption id="AMD ROCm">
 
-### AMD GPU
+#### AMD GPU
 
 bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
 
@@ -179,7 +188,7 @@ pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise
 </hfoption>
 <hfoption id="Intel CPU + GPU">
 
-### Intel CPU
+#### Intel CPU
 
 > [!TIP]
 > Intel CPU backend only supports building from source; for now, please follow the instructions below.
@@ -200,6 +209,8 @@ pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise
 </hfoption>
 <hfoption id="Apple Silicon (MPS)">
 
+#### Apple Silicon
+
 WIP
 
 </hfoption>

From c8383fbf65cee2bc61f7421dc9b57ad9e9447c1e Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Thu, 29 Aug 2024 16:00:34 +0000
Subject: [PATCH 180/234] docs: get started on detailed multi-backend guide

---
 docs/source/_toctree.yml          |  2 ++
 docs/source/non_cuda_backends.mdx | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 docs/source/non_cuda_backends.mdx

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index fdfe19ee4..a72eb1967 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -12,6 +12,8 @@
     title: 8-bit optimizers
   - local: algorithms
     title: Algorithms
+  - local: non_cuda_backends
+    title: Non-CUDA compute backends
   - local: fsdp_qlora
     title: FSDP-QLoRA
   - local: integrations
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
new file mode 100644
index 000000000..fca586534
--- /dev/null
+++ b/docs/source/non_cuda_backends.mdx
@@ -0,0 +1,27 @@
+# Multi-backend support (non-CUDA backends)
+
+As part of a recent refactoring effort, we will soon offer official multi-backend support. Currently, this feature is available in a preview alpha release, allowing us to gather early feedback from users to improve the functionality and identify any bugs.
+
+At present, the Intel CPU and AMD ROCm backends are considered fully functional. The Intel XPU backend has limited functionality and is less mature.
+
+Please refer to the [installation instructions](./installation#multi-backend) for details on installing the backend you intend to test (and hopefully provide feedback on).
+
+> [!Tip]
+> Apple Silicon support is planned for Q4 2024. We are actively seeking contributors to help implement this, develop a concrete plan, and create a detailed list of requirements. Due to limited resources, we rely on community contributions for this implementation effort. To discuss further, please spell out your thoughts and discuss in [this GitHub discussion](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340) and tag `@Titus-von-Koeller` and `@matthewdouglas`. Thank you!
+
+## Alpha Release
+
+As we are currently in the alpha testing phase, bugs are expected, and performance might not meet expectations. However, this is exactly what we want to discover from **your** perspective as the end user!
+
+Please share and discuss your feedback with us here:
+
+- [Github Discussion: Multi-backend refactor: Alpha release ( AMD ROCm ONLY )](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1339)
+- [Github Discussion: Multi-backend refactor: Alpha release ( Intel ONLY )](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1338)
+
+Thank you for your support!
+
+## Benchmarks
+
+### Intel
+
+### AMD

From 3b94d626fdcde73b32586995828d68010668bedd Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Fri, 30 Aug 2024 01:25:43 +0800
Subject: [PATCH 181/234] rm warn for multi backend (#1336)

---
 bitsandbytes/cextension.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index cfeaf4f44..6c18275c6 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -106,10 +106,6 @@ def get_native_library() -> BNBNativeLibrary:
     if hasattr(dll, "get_context"):  # only a CUDA-built library exposes this
         return CudaBNBNativeLibrary(dll)
 
-    logger.warning(
-        "The installed version of bitsandbytes was compiled without GPU support. "
-        "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.",
-    )
     return BNBNativeLibrary(dll)
 
 

From 39097a6fae9951630e83baa7b6a34f569d91f1a9 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 30 Aug 2024 10:18:48 +0000
Subject: [PATCH 182/234] actions: update permissions for pr docs publishing

---
 .github/workflows/upload_pr_documentation.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml
index 6497caf2d..707705297 100644
--- a/.github/workflows/upload_pr_documentation.yml
+++ b/.github/workflows/upload_pr_documentation.yml
@@ -6,6 +6,10 @@ on:
     types:
       - completed
 
+permissions:
+  contents: read
+  pull-requests: write # Allows posting comments on pull requests
+
 jobs:
   build:
     uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main

From 27846533d19eed5c6ef3cb01e8ee237639069180 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Fri, 13 Sep 2024 15:49:28 +0800
Subject: [PATCH 183/234] fix nf4 memory issue by init op_context in forward
 (#1349)

* fix nf4 memory issue by init op_context in forward

* disable repack in init

* fix code style
---
 bitsandbytes/backends/cpu_xpu_common.py | 19 -----------------
 bitsandbytes/nn/modules.py              | 27 +++++++++++++++++++++----
 bitsandbytes/utils.py                   | 24 ++++++++++++++++++++++
 3 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 0fcfffa07..0d865b541 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -370,25 +370,6 @@ def quantize_4bit_impl(
             quant_type=quant_type,
         )
 
-    if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and input_shape[1] % blocksize == 0 and quant_type == "nf4":
-        # lowp_mode: lowest precision for computation
-        lowp_mode = ipex_cpu.quantization.WoqLowpMode.BF16
-        state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
-            out.reshape([input_shape[0], input_shape[1] // 2]),
-            ipex_cpu.quantization.WoqWeightDtype.NF4,
-            input_shape,  # weight shape
-            absmax.view(input_shape[0], input_shape[1] // blocksize),  # scales
-            None,  # zero_points
-            None,  # bias
-            None,  # g_idx
-            None,  # batch_size
-            blocksize,
-            int(lowp_mode),
-            -1,  # act_quant_mode. -1 means don't quant activation
-        )
-        state.absmax = torch.Tensor()
-        return torch.empty([1, 0], dtype=torch.uint8), state
-
     return out.unsqueeze(0), state
 
 
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 2348d0791..ad424a6f4 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -19,6 +19,7 @@
     INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
     LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
     OutlierTracer,
+    enable_ipex_fusion,
 )
 
 T = TypeVar("T", bound="torch.nn.Module")
@@ -444,17 +445,35 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
         save weight and bias,
         then fill state_dict with components of quant_state
         """
+        if (
+            getattr(self.weight, "quant_state", None) is not None
+            and getattr(self.weight.quant_state, "op_context", None) is not None
+        ):
+            context = self.weight.quant_state.op_context
+            self.weight.data = context.to_public(context.get_weight()).reshape([1, -1])
+
         super()._save_to_state_dict(destination, prefix, keep_vars)  # saving weight and bias
 
         if getattr(self.weight, "quant_state", None) is not None:
+            if (
+                self.weight.quant_state.absmax.shape.numel() == 0
+                and getattr(self.weight.quant_state, "op_context", None) is not None
+            ):
+                self.weight.quant_state.absmax = context.get_scales().reshape(-1)
+                delattr(self.weight.quant_state, "op_context")
             for k, v in self.weight.quant_state.as_dict(packed=True).items():
                 destination[prefix + "weight." + k] = v if keep_vars else v.detach()
-            if getattr(self.weight.quant_state, "op_context", None) is not None:
-                context = self.weight.quant_state.op_context
-                destination[prefix + "weight." + "absmax"] = context.get_scales().reshape(-1)
-                self.weight.data = context.to_public(context.get_weight()).reshape([1, -1])
 
     def forward(self, x: torch.Tensor):
+        # Check if ipex fusion can be used
+        if (
+            x.device.type == "cpu"
+            and not hasattr(self.weight.quant_state, "op_context")
+            and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
+            and self.weight.quant_state.quant_type == "nf4"
+        ):
+            enable_ipex_fusion(self.weight, self.weight.quant_state)
+
         # weights are cast automatically as Int8Params, but the bias has to be cast manually
         if self.bias is not None and self.bias.dtype != x.dtype:
             self.bias.data = self.bias.data.to(x.dtype)
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index fa9a7eb70..9e52c915d 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,6 +200,30 @@ def unpack_tensor_to_dict(tensor_data):
     return unpacked_dict
 
 
+def enable_ipex_fusion(weight, quant_state):
+    from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq
+
+    if _ipex_cpu_version_prereq(2, 3):
+        import intel_extension_for_pytorch as ipex
+
+        lowp_mode = ipex.quantization.WoqLowpMode.BF16
+        quant_state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
+            weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+            ipex.quantization.WoqWeightDtype.NF4,
+            quant_state.shape,  # weight shape
+            quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
+            None,  # zero_points
+            None,  # bias
+            None,  # g_idx
+            None,  # batch_size
+            quant_state.blocksize,
+            int(lowp_mode),
+            -1,  # act_quant_mode. -1 means don't quant activation
+        )
+        quant_state.absmax = torch.Tensor()
+        weight.data = torch.empty([1, 0], dtype=torch.uint8)
+
+
 class QuantState:
     """container for quantization state components to work with Params4bit and similar classes"""
 

From 45b7d14a9ae58927688c04dde6a8d70275abd0ae Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:45:43 -0500
Subject: [PATCH 184/234] AMD: Clarify diagnostic messages; free up disk space
 for CI build

* Add build job for rocm

* Add rocm build script

* Copy shared obj file into output_dir

* upload build artifacts and enable wheels build

* Remove cuda build temporarily

* Add ROCm version to .so filename

* Add rocm_version to whls build

* Revert "Remove cuda build temporarily"

This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738.

* Add rocm_version env var

* Remove thrush header files

* Print node info

* print cuda node info

* Revert "print cuda node info"

This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0.

* Revert "Print node info"

This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022.

* Add rocm arch to compile command

* Rename .so files to rocm

* Update default gpu arch

* Skip cpu based igemmlt int tests on ROCm

* Update Documentation

* Update upstream repo name

* Update docs

* Update string format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove pre-release option for torch install

* Update pytorch install path

Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>

* Add messages for Heuristics error

* Remove toolcache for disk space

* print disk usage

* Clean disk space for linux

* Fix for ubuntu

* Add sudo for apt clean

* Update clean up disk list

* remove disk usage print

* Add BNB_BACKEND variable

* Update diagnostic functions for ROCm

* Fix tuple error

* Fix library detection bug for recursive and symlink cases

* fix pre-commit errors

* Remove recursive path lib search

* Create function for runtime lib patterns

* Update logger format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove commented code

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

* Create hip diagnostics functions

* Fix Typo

* Fix pre-commit checks

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
---
 .github/workflows/python-package.yml | 21 +++++--
 bitsandbytes/cextension.py           | 11 ++--
 bitsandbytes/diagnostics/cuda.py     | 89 ++++++++++++++++++++++++----
 bitsandbytes/diagnostics/main.py     | 31 ++++++----
 csrc/ops.hip                         | 26 +++++---
 5 files changed, 137 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 91e6d82a6..d2da82501 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -116,10 +116,23 @@ jobs:
         uses: docker/setup-qemu-action@v2
       - name: Clean up disk space
         run: |
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          sudo rm -rf \
+              /usr/share/dotnet \
+              /opt/ghc \
+              "/usr/local/share/boost" \
+              "$AGENT_TOOLSDIRECTORY" \
+              /opt/hostedtoolcache \
+              /opt/google/chrome \
+              /opt/microsoft/msedge \
+              /opt/microsoft/powershell \
+              /opt/pipx \
+              /usr/lib/mono \
+              /usr/local/julia* \
+              /usr/local/lib/android \
+              /usr/local/lib/node_modules \
+              /usr/local/share/chromium \
+              /usr/local/share/powershell \
+              /usr/share/swift
       - name: Build C++
         run: bash .github/scripts/build-rocm.sh
         env:
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 6c18275c6..cc5d8deff 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -99,7 +99,7 @@ def get_native_library() -> BNBNativeLibrary:
         if cuda_binary_path.exists():
             binary_path = cuda_binary_path
         else:
-            logger.warning("Could not find the bitsandbytes CUDA binary at %r", cuda_binary_path)
+            logger.warning("Could not find the bitsandbytes %s binary at %r", BNB_BACKEND, cuda_binary_path)
     logger.debug(f"Loading bitsandbytes native library from: {binary_path}")
     dll = ct.cdll.LoadLibrary(str(binary_path))
 
@@ -116,21 +116,24 @@ def get_native_library() -> BNBNativeLibrary:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
         HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
         BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
+        BNB_BACKEND = "ROCm"
     else:
         HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
         BNB_HIP_VERSION_SHORT = ""
+        BNB_BACKEND = "CUDA"
+
     lib = get_native_library()
 except Exception as e:
     lib = None
     logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True)
     if torch.cuda.is_available():
         logger.warning(
-            """
-CUDA Setup failed despite CUDA being available. Please run the following command to get more information:
+            f"""
+{BNB_BACKEND} Setup failed despite {BNB_BACKEND} being available. Please run the following command to get more information:
 
 python -m bitsandbytes
 
-Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
+Inspect the output of the command and see if you can locate {BNB_BACKEND} libraries. You might need to add them
 to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
 and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues
 """,
diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py
index 8974c6400..014b753a9 100644
--- a/bitsandbytes/diagnostics/cuda.py
+++ b/bitsandbytes/diagnostics/cuda.py
@@ -5,7 +5,7 @@
 
 import torch
 
-from bitsandbytes.cextension import get_cuda_bnb_library_path
+from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
 from bitsandbytes.consts import NONPYTORCH_DOC_URL
 from bitsandbytes.cuda_specs import CUDASpecs
 from bitsandbytes.diagnostics.utils import print_dedented
@@ -32,15 +32,20 @@
     "_",  # current Python interpreter
 }
 
-CUDA_RUNTIME_LIB_PATTERNS = (
-    "cudart64*.dll",  # Windows
-    "libcudart*.so*",  # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
-    "nvcuda*.dll",  # Windows
-)
-
 logger = logging.getLogger(__name__)
 
 
+def get_runtime_lib_patterns() -> tuple:
+    if HIP_ENVIRONMENT:
+        return ("libamdhip64.so*",)
+    else:
+        return (
+            "cudart64*.dll",  # Windows
+            "libcudart*.so*",  # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
+            "nvcuda*.dll",  # Windows
+        )
+
+
 def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
     for dir_string in paths_list_candidate.split(os.pathsep):
         if not dir_string:
@@ -55,9 +60,9 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path
                     continue
             except OSError:  # Assume an esoteric error trying to poke at the directory
                 pass
-            for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
+            for lib_pattern in get_runtime_lib_patterns():
                 for pth in dir.glob(lib_pattern):
-                    if pth.is_file():
+                    if pth.is_file() and not pth.is_symlink():
                         yield pth
         except (OSError, PermissionError):
             pass
@@ -104,7 +109,7 @@ def find_cudart_libraries() -> Iterator[Path]:
         yield from find_cuda_libraries_in_path_list(value)
 
 
-def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
+def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
     print(
         f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
         f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
@@ -149,10 +154,40 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
     # (2) Multiple CUDA versions installed
 
 
-def print_cuda_runtime_diagnostics() -> None:
+def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
+    print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
+
+    binary_path = get_cuda_bnb_library_path(cuda_specs)
+    if not binary_path.exists():
+        print_dedented(
+            f"""
+        Library not found: {binary_path}.
+        Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
+        in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
+        and rebuild bitsandbytes.
+        """,
+        )
+
+    hip_major, hip_minor = cuda_specs.cuda_version_tuple
+    if (hip_major, hip_minor) < (6, 1):
+        print_dedented(
+            """
+            WARNING: bitsandbytes is fully supported only from ROCm 6.1.
+            """,
+        )
+
+
+def print_diagnostics(cuda_specs: CUDASpecs) -> None:
+    if HIP_ENVIRONMENT:
+        _print_hip_diagnostics(cuda_specs)
+    else:
+        _print_cuda_diagnostics(cuda_specs)
+
+
+def _print_cuda_runtime_diagnostics() -> None:
     cudart_paths = list(find_cudart_libraries())
     if not cudart_paths:
-        print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.")
+        print("WARNING! CUDA runtime files not found in any environmental path.")
     elif len(cudart_paths) > 1:
         print_dedented(
             f"""
@@ -174,3 +209,33 @@ def print_cuda_runtime_diagnostics() -> None:
         )
         for pth in cudart_paths:
             print(f"* Found CUDA runtime at: {pth}")
+
+
+def _print_hip_runtime_diagnostics() -> None:
+    cudart_paths = list(find_cudart_libraries())
+    if not cudart_paths:
+        print("WARNING! ROCm runtime files not found in any environmental path.")
+    elif len(cudart_paths) > 1:
+        print_dedented(
+            f"""
+            Found duplicate ROCm runtime files (see below).
+
+            We select the PyTorch default ROCm runtime, which is {torch.version.hip},
+            but this might mismatch with the ROCm version that is needed for bitsandbytes.
+
+            To resolve it, install PyTorch built for the ROCm version you want to use
+
+            and set LD_LIBRARY_PATH to your ROCm install path, e.g.
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib,
+            """,
+        )
+
+        for pth in cudart_paths:
+            print(f"* Found ROCm runtime at: {pth}")
+
+
+def print_runtime_diagnostics() -> None:
+    if HIP_ENVIRONMENT:
+        _print_hip_runtime_diagnostics()
+    else:
+        _print_cuda_runtime_diagnostics()
diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py
index 1ce096f69..8dc43ed2a 100644
--- a/bitsandbytes/diagnostics/main.py
+++ b/bitsandbytes/diagnostics/main.py
@@ -3,11 +3,12 @@
 
 import torch
 
+from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
 from bitsandbytes.consts import PACKAGE_GITHUB_URL
 from bitsandbytes.cuda_specs import get_cuda_specs
 from bitsandbytes.diagnostics.cuda import (
-    print_cuda_diagnostics,
-    print_cuda_runtime_diagnostics,
+    print_diagnostics,
+    print_runtime_diagnostics,
 )
 from bitsandbytes.diagnostics.utils import print_dedented, print_header
 
@@ -16,12 +17,13 @@ def sanity_check():
     from bitsandbytes.cextension import lib
 
     if lib is None:
+        compute_backend = "cuda" if not HIP_ENVIRONMENT else "hip"
         print_dedented(
-            """
+            f"""
             Couldn't load the bitsandbytes library, likely due to missing binaries.
             Please ensure bitsandbytes is properly installed.
 
-            For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND=cuda -S .`.
+            For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND={compute_backend} -S .`.
             See the documentation for more details if needed.
 
             Trying a simple check anyway, but this will likely fail...
@@ -49,19 +51,24 @@ def main():
 
     print_header("OTHER")
     cuda_specs = get_cuda_specs()
-    print("CUDA specs:", cuda_specs)
+    if HIP_ENVIRONMENT:
+        rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
+        rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
+        print(f"{BNB_BACKEND} specs:{rocm_specs}")
+    else:
+        print(f"{BNB_BACKEND} specs:{cuda_specs}")
     if not torch.cuda.is_available():
-        print("Torch says CUDA is not available. Possible reasons:")
-        print("1. CUDA driver not installed")
-        print("2. CUDA not installed")
-        print("3. You have multiple conflicting CUDA libraries")
+        print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
+        print(f"1. {BNB_BACKEND} driver not installed")
+        print(f"2. {BNB_BACKEND} not installed")
+        print(f"3. You have multiple conflicting {BNB_BACKEND} libraries")
     if cuda_specs:
-        print_cuda_diagnostics(cuda_specs)
-    print_cuda_runtime_diagnostics()
+        print_diagnostics(cuda_specs)
+    print_runtime_diagnostics()
     print_header("")
     print_header("DEBUG INFO END")
     print_header("")
-    print("Checking that the library is importable and CUDA is callable...")
+    print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
     try:
         sanity_check()
         print("SUCCESS!")
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 157e84629..4fdc3cbfa 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -576,6 +576,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
       if (returnedAlgoCount == 0)
       {
         has_error = 1;
+        fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
       }
       else
       {
@@ -614,18 +615,25 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
                                                             heuristicResult,
                                                             &returnedAlgoCount));
 
-      if(!SCALE_ROWS)
+      if (returnedAlgoCount == 0)
       {
-        float alpha = 1.0f, beta = 0.0f;
-
-        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+        has_error = 1;
+        fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
       }
       else
       {
-        //has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
-	float beta = 0.0f;
-
-        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+        if(!SCALE_ROWS)
+        {
+          float alpha = 1.0f, beta = 0.0f;
+
+          has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+        }
+        else
+        {
+          float beta = 0.0f;
+
+          has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
+        }
       }
     }
 
@@ -635,7 +643,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
     if (Adesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Adesc));
     if (matmulDesc) has_error |= checkHipblasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
     if(has_error == 1)
-      printf("error detected");
+      fprintf(stderr, "error detected\n");
 
     return has_error;
 #endif // NO_HIPBLASLT

From a23984fed5f87f24348d2e8f10e8792853d5eaed Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Fri, 20 Sep 2024 07:35:40 +0800
Subject: [PATCH 185/234] check grad before using ipex (#1358)

---
 bitsandbytes/nn/modules.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index ad424a6f4..32854413f 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -471,6 +471,7 @@ def forward(self, x: torch.Tensor):
             and not hasattr(self.weight.quant_state, "op_context")
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
+            and x.requires_grad == False
         ):
             enable_ipex_fusion(self.weight, self.weight.quant_state)
 

From e8881bef17a4666ac5fee65a73bf337cdc8ca547 Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Fri, 20 Sep 2024 15:54:58 -0500
Subject: [PATCH 186/234] Enable packaging for ROCm 6.2 (#1367)

* Enable 6.2 build

* Update documentation for 6.2.0 pip install
---
 .github/workflows/python-package.yml | 2 +-
 docs/source/installation.mdx         | 2 +-
 tests/test_functional.py             | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index d2da82501..21c4c1895 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -107,7 +107,7 @@ jobs:
         os: [ubuntu-latest]
         arch: [x86_64]
         rocm_version:
-          ["6.1.2"]
+          ["6.1.2", "6.2"]
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
       - uses: actions/checkout@v4
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 60419b38a..146fb0ddd 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -167,7 +167,7 @@ apt-get update && apt-get install -y git && cd home
 pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
 
 # Install bitsandbytes from PyPI
-# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100
+# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2/6.2.0 and gpu arch - gfx90a, gfx942, gfx1100
 # Please install from source if your configuration doesn't match with these)
 pip install bitsandbytes
 
diff --git a/tests/test_functional.py b/tests/test_functional.py
index a9d926b89..35187db78 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2303,6 +2303,7 @@ def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind):
             assert maxratio < 1.02 and maxratio > 0.98
 
 
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
 @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)

From 0d3d977c8f9fab7193345a4dc8f2e19c9bb35db3 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 9 Sep 2024 14:31:38 -0400
Subject: [PATCH 187/234] Update for VS2022 17.11 compatibility with CUDA <
 12.4 (#1341)

* Update for VS2022 17.11 compatibility with CUDA < 12.4

* Try again
---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index eac72fe52..315e0ff1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,6 +82,11 @@ if(BUILD_CUDA)
     # This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
     if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940)
         string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler")
+
+        # This is needed to build with VS2022 17.11+ and CUDA < 12.4.
+        if (MSVC_VERSION VERSION_GREATER_EQUAL 1941)
+            string(APPEND CMAKE_CUDA_FLAGS " -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH")
+        endif()
     endif()
 
     enable_language(CUDA) # This will fail if CUDA is not found

From e72637c99cd314a0b840615754fb4e433875b550 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 26 Sep 2024 09:45:42 -0400
Subject: [PATCH 188/234] Enable continuous releases for multi-backend-refactor
 branch

---
 .github/workflows/python-package.yml | 50 ++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 21c4c1895..3aeeef9ba 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -197,6 +197,56 @@ jobs:
           path: dist/bitsandbytes-*.whl
           retention-days: 7
 
+  upload-pre-release-wheels:
+    name: Create release and upload artifacts
+    runs-on: ubuntu-latest
+    if: github.ref_name == 'multi-backend-refactor'
+    permissions:
+      contents: write
+    needs:
+      - build-wheels
+    steps:
+      - name: Download artifacts to tmp directory
+        uses: actions/download-artifact@v4
+        with:
+          path: tmp/
+          pattern: "bdist_wheel_*"
+          merge-multiple: true
+      - name: Inspect tmp directory after downloading artifacts
+        run: ls -alFR tmp/
+      - name: Move and rename wheel files
+        run: |
+          mkdir -p wheels/
+          find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
+            wheel_filename=$(basename "$wheel")
+            if [[ $wheel_filename == *linux*x86_64* ]]; then
+              mv "$wheel" wheels/bnb-linux-x86_64.whl
+            elif [[ $wheel_filename == *linux*aarch64* ]]; then
+              mv "$wheel" wheels/bnb-linux-aarch64.whl
+            elif [[ $wheel_filename == *macosx*x86_64* ]]; then
+              mv "$wheel" wheels/bnb-macos-x86_64.whl
+            elif [[ $wheel_filename == *macosx*arm64* ]]; then
+              mv "$wheel" wheels/bnb-macos-arm64.whl
+            elif [[ $wheel_filename == *win*amd64* ]]; then
+              mv "$wheel" wheels/bnb-windows-x86_64.whl
+            else
+              echo "Unknown wheel format: $wheel_filename"
+              exit 1
+            fi
+          done
+      - name: Inspect wheels directory after renaming files
+        run: ls -alFR wheels/
+      - name: Create release and upload artifacts
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_CONTINUOUS_RELEASE_NAME: Multi-Backend Preview
+          GITHUB_CONTINUOUS_RELEASE_TYPE: prerelease
+          GITHUB_CONTINUOUS_RELEASE_TAG: continuous-release_preview
+        run: |
+          wget -q https://github.com/TheAssassin/pyuploadtool/releases/download/continuous/pyuploadtool-x86_64.AppImage
+          chmod +x pyuploadtool-x86_64.AppImage
+          ./pyuploadtool-x86_64.AppImage --appimage-extract-and-run wheels/*.whl
+
   audit-wheels:
     needs: build-wheels
     runs-on: ubuntu-latest

From 662dc6057ad95207fe27fdd3925dd5c4094a8488 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:01:34 -0400
Subject: [PATCH 189/234] Update release workflow

---
 .github/workflows/python-package.yml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 3aeeef9ba..77316967d 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -237,15 +237,15 @@ jobs:
       - name: Inspect wheels directory after renaming files
         run: ls -alFR wheels/
       - name: Create release and upload artifacts
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GITHUB_CONTINUOUS_RELEASE_NAME: Multi-Backend Preview
-          GITHUB_CONTINUOUS_RELEASE_TYPE: prerelease
-          GITHUB_CONTINUOUS_RELEASE_TAG: continuous-release_preview
-        run: |
-          wget -q https://github.com/TheAssassin/pyuploadtool/releases/download/continuous/pyuploadtool-x86_64.AppImage
-          chmod +x pyuploadtool-x86_64.AppImage
-          ./pyuploadtool-x86_64.AppImage --appimage-extract-and-run wheels/*.whl
+        uses: softprops/action-gh-release@v2.0.8
+        with:
+          files: wheels/*.whl
+          prerelease: true
+          name: Multi-Backend Preview
+          tag_name: continuous-release-preview
+          make_latest: false
+          draft: true
+          target_commitish: ${{ github.ref_name }}
 
   audit-wheels:
     needs: build-wheels

From 3227cdd366770c1e7b40eff3bf43dbbe012b6a9e Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:20:00 -0400
Subject: [PATCH 190/234] Publish continuous release for multi-backend

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 77316967d..37e52be6c 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -244,7 +244,7 @@ jobs:
           name: Multi-Backend Preview
           tag_name: continuous-release-preview
           make_latest: false
-          draft: true
+          draft: false
           target_commitish: ${{ github.ref_name }}
 
   audit-wheels:

From 0a2b5392ff079645fdc9ff887f80d327f9e874f7 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 15:09:59 +0000
Subject: [PATCH 191/234] continuous release: revert wheel renaming due to
 install err

---
 .github/workflows/python-package.yml | 32 +++++-----------------------
 1 file changed, 5 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 37e52be6c..42d3d0957 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -206,40 +206,18 @@ jobs:
     needs:
       - build-wheels
     steps:
-      - name: Download artifacts to tmp directory
+      - name: Download artifacts
         uses: actions/download-artifact@v4
         with:
-          path: tmp/
+          path: artifacts/
           pattern: "bdist_wheel_*"
           merge-multiple: true
-      - name: Inspect tmp directory after downloading artifacts
-        run: ls -alFR tmp/
-      - name: Move and rename wheel files
-        run: |
-          mkdir -p wheels/
-          find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
-            wheel_filename=$(basename "$wheel")
-            if [[ $wheel_filename == *linux*x86_64* ]]; then
-              mv "$wheel" wheels/bnb-linux-x86_64.whl
-            elif [[ $wheel_filename == *linux*aarch64* ]]; then
-              mv "$wheel" wheels/bnb-linux-aarch64.whl
-            elif [[ $wheel_filename == *macosx*x86_64* ]]; then
-              mv "$wheel" wheels/bnb-macos-x86_64.whl
-            elif [[ $wheel_filename == *macosx*arm64* ]]; then
-              mv "$wheel" wheels/bnb-macos-arm64.whl
-            elif [[ $wheel_filename == *win*amd64* ]]; then
-              mv "$wheel" wheels/bnb-windows-x86_64.whl
-            else
-              echo "Unknown wheel format: $wheel_filename"
-              exit 1
-            fi
-          done
-      - name: Inspect wheels directory after renaming files
-        run: ls -alFR wheels/
+      - name: Inspect artifacts directory after downloading
+        run: ls -alFR artifacts/
       - name: Create release and upload artifacts
         uses: softprops/action-gh-release@v2.0.8
         with:
-          files: wheels/*.whl
+          files: artifacts/**/*.whl
           prerelease: true
           name: Multi-Backend Preview
           tag_name: continuous-release-preview

From 8c5499e7498112fbdf172d2cba0d92a505ecef44 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 17:38:12 +0000
Subject: [PATCH 192/234] Revert "continuous release: revert wheel renaming due
 to install err"

This reverts commit 0a2b5392ff079645fdc9ff887f80d327f9e874f7.
---
 .github/workflows/python-package.yml | 32 +++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 42d3d0957..37e52be6c 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -206,18 +206,40 @@ jobs:
     needs:
       - build-wheels
     steps:
-      - name: Download artifacts
+      - name: Download artifacts to tmp directory
         uses: actions/download-artifact@v4
         with:
-          path: artifacts/
+          path: tmp/
           pattern: "bdist_wheel_*"
           merge-multiple: true
-      - name: Inspect artifacts directory after downloading
-        run: ls -alFR artifacts/
+      - name: Inspect tmp directory after downloading artifacts
+        run: ls -alFR tmp/
+      - name: Move and rename wheel files
+        run: |
+          mkdir -p wheels/
+          find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
+            wheel_filename=$(basename "$wheel")
+            if [[ $wheel_filename == *linux*x86_64* ]]; then
+              mv "$wheel" wheels/bnb-linux-x86_64.whl
+            elif [[ $wheel_filename == *linux*aarch64* ]]; then
+              mv "$wheel" wheels/bnb-linux-aarch64.whl
+            elif [[ $wheel_filename == *macosx*x86_64* ]]; then
+              mv "$wheel" wheels/bnb-macos-x86_64.whl
+            elif [[ $wheel_filename == *macosx*arm64* ]]; then
+              mv "$wheel" wheels/bnb-macos-arm64.whl
+            elif [[ $wheel_filename == *win*amd64* ]]; then
+              mv "$wheel" wheels/bnb-windows-x86_64.whl
+            else
+              echo "Unknown wheel format: $wheel_filename"
+              exit 1
+            fi
+          done
+      - name: Inspect wheels directory after renaming files
+        run: ls -alFR wheels/
       - name: Create release and upload artifacts
         uses: softprops/action-gh-release@v2.0.8
         with:
-          files: artifacts/**/*.whl
+          files: wheels/*.whl
           prerelease: true
           name: Multi-Backend Preview
           tag_name: continuous-release-preview

From 02d5b423a56908e22edfe3a044de251de13dd231 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 19:08:29 +0000
Subject: [PATCH 193/234] add dynamic tag-based versioning + git hash for dev
 vers

---
 .github/workflows/python-package.yml | 21 +++++-------------
 .gitignore                           |  2 ++
 bitsandbytes/__init__.py             |  5 +++--
 setup.py                             | 32 +++++++++++++++++++++++++++-
 4 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 37e52be6c..f655df4f9 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -206,7 +206,7 @@ jobs:
     needs:
       - build-wheels
     steps:
-      - name: Download artifacts to tmp directory
+      - name: Download and rename artifacts
         uses: actions/download-artifact@v4
         with:
           path: tmp/
@@ -214,25 +214,14 @@ jobs:
           merge-multiple: true
       - name: Inspect tmp directory after downloading artifacts
         run: ls -alFR tmp/
-      - name: Move and rename wheel files
+      - name: Move and rename wheel files with pattern replacement
         run: |
           mkdir -p wheels/
           find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
             wheel_filename=$(basename "$wheel")
-            if [[ $wheel_filename == *linux*x86_64* ]]; then
-              mv "$wheel" wheels/bnb-linux-x86_64.whl
-            elif [[ $wheel_filename == *linux*aarch64* ]]; then
-              mv "$wheel" wheels/bnb-linux-aarch64.whl
-            elif [[ $wheel_filename == *macosx*x86_64* ]]; then
-              mv "$wheel" wheels/bnb-macos-x86_64.whl
-            elif [[ $wheel_filename == *macosx*arm64* ]]; then
-              mv "$wheel" wheels/bnb-macos-arm64.whl
-            elif [[ $wheel_filename == *win*amd64* ]]; then
-              mv "$wheel" wheels/bnb-windows-x86_64.whl
-            else
-              echo "Unknown wheel format: $wheel_filename"
-              exit 1
-            fi
+            # Remove the gith hash, e.g. `+1234567`, for a stable download link on the multi-backend pre-release
+            cleaned_filename=$(echo "$wheel_filename" | sed -E 's/\+[0-9a-f]{7}-/-/g')
+            mv "$wheel" "wheels/$cleaned_filename"
           done
       - name: Inspect wheels directory after renaming files
         run: ls -alFR wheels/
diff --git a/.gitignore b/.gitignore
index 22f5a6cd6..cd1b797bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,6 +151,8 @@ dmypy.json
 # vim
 *.swp
 
+# BNB-specific stuff
 dependencies
 cuda_build
 output/
+bitsandbytes/_version.py
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 1e638eb79..25ec8a79a 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -3,6 +3,9 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+# Import the dynamically generated version from _version.py  (see setup.py)
+from ._version import __version__  # isort: skip # type: ignore
+
 import torch
 
 from . import research, utils
@@ -73,5 +76,3 @@
     "optim.optimizer.Optimizer8bit": False,
     "optim.optimizer.MockArgs": False,
 }
-
-__version__ = "0.43.3.dev"
diff --git a/setup.py b/setup.py
index 18de0fe5b..2b1c1aff3 100644
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 import glob
 import os
+import subprocess
 
 from setuptools import find_packages, setup
 from setuptools.dist import Distribution
@@ -13,6 +14,35 @@
 print("libs:", libs)
 
 
+def get_git_commit_hash():
+    return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("utf-8").strip()
+
+
+def is_git_tagged_commit():
+    tags = subprocess.check_output(["git", "tag", "--points-at", "HEAD"]).decode("utf-8").strip()
+    return bool(tags)
+
+
+def get_latest_semver_tag():
+    tags = subprocess.check_output(["git", "tag"], text=True).splitlines()
+    semver_tags = [tag for tag in tags if tag.count(".") == 2 and all(part.isdigit() for part in tag.split("."))]
+    if not semver_tags:
+        raise ValueError("No valid semantic version tags found")
+    return sorted(semver_tags, key=lambda s: list(map(int, s.split("."))))[-1]
+
+
+def write_version_file(version, filepath="bitsandbytes/_version.py"):
+    with open(filepath, "w") as f:
+        f.write(f'__version__ = "{version}"\n')
+
+
+def get_version_and_write_to_file():
+    latest_semver_tag = get_latest_semver_tag()
+    version = latest_semver_tag if is_git_tagged_commit() else f"{latest_semver_tag}.dev+{get_git_commit_hash()}"
+    write_version_file(version)
+    return version
+
+
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 
@@ -25,7 +55,7 @@ def has_ext_modules(self):
 
 setup(
     name="bitsandbytes",
-    version="0.43.3.dev",
+    version=get_version_and_write_to_file(),
     author="Tim Dettmers",
     author_email="dettmers@cs.washington.edu",
     description="k-bit optimizers and matrix multiplication routines.",

From 6927dcc493562cdec804ffc833627275686b3904 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 19:50:51 +0000
Subject: [PATCH 194/234] docs: update w/ changes from `main`

---
 docs/source/contributing.mdx      |  5 +++--
 docs/source/installation.mdx      |  2 +-
 docs/source/non_cuda_backends.mdx | 16 +++++++++++++++-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/docs/source/contributing.mdx b/docs/source/contributing.mdx
index 4fe6b7541..5da42961e 100644
--- a/docs/source/contributing.mdx
+++ b/docs/source/contributing.mdx
@@ -5,8 +5,9 @@
 
 ### Setup pre-commit hooks
 - Install pre-commit hooks with `pip install pre-commit`.
-- Run `pre-commit autoupdate` once to configure the hooks.
-- Re-run `pre-commit autoupdate` every time a new hook got added.
+- Run `pre-commit install` once to install the hooks, so they will be run on every commit.
+- If the hooks introduce changes, they'll be visible with `git diff`. Review them and `git add` them if everything is fine, then re-execute the before commit, it should pass now.
+- If you want to manually trigger the hooks, you may do `pre-commit run --all-files`
 
 Now all the pre-commit hooks will be automatically run when you try to commit and if they introduce some changes, you need to re-add the changed files before being able to commit and push.
 
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 146fb0ddd..2f82c199b 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -137,7 +137,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
 ## Multi-backend[[multi-backend]]
 
 > [!TIP]
-> This functionality is currently in preview and therefore not yet production-ready!
+> This functionality is currently in preview and therefore not yet production-ready! Please reference [this guide](./non_cuda_backends) for more in-depth information about the different backends and their current status.
 
 Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
 
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index fca586534..fc7c6ac27 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -24,4 +24,18 @@ Thank you for your support!
 
 ### Intel
 
-### AMD
+The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
+
+#### Inference (CPU)
+
+| Data Type | BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Speed-Up (vs BF16) | 1.0x | 0.6x | 2.3x | 0.03x |
+| Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
+
+#### Fine-Tuning (CPU)
+
+| Data Type | AMP BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Speed-Up (vs AMP BF16) | 1.0x | 0.38x | 0.07x | 0.07x |
+| Memory (GB) | 40 | 9 | 6.6 | 6.6 |

From 8dcd971cc11ab3449eea01419ec1676d5d5e53c8 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 27 Sep 2024 20:24:03 +0000
Subject: [PATCH 195/234] get tags for dynamic versioning

---
 .github/workflows/python-package.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index f655df4f9..9cd9ceb78 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -166,6 +166,13 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1  # shallow clone
+      - name: Fetch tags for dynamic versioning in setup.py
+        run: |
+          git fetch --depth=1 origin --tags
+          echo "Available Git tags:"
+          git tag -n
       - name: Download build artifact
         uses: actions/download-artifact@v4
         with:
@@ -183,7 +190,8 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: pip
       - run: pip install build wheel
-      - run: python -m build .
+      # for now need to do the below instead of prior `python -m build .`, which didn't allow us to access git tags
+      - run: python -m build --sdist && python -m build --wheel
       - name: Determine and Set Platform Tag, then Tag Wheel
         shell: bash
         run: |

From 09ac7ec34f556d74356167ed4214d9e1f3f98bad Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 18:34:53 +0000
Subject: [PATCH 196/234] fine-tune continuous release params

---
 .github/workflows/python-package.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 9cd9ceb78..f96dd995e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -239,10 +239,10 @@ jobs:
           files: wheels/*.whl
           prerelease: true
           name: Multi-Backend Preview
-          tag_name: continuous-release-preview
+          tag_name: continuous-release_multi-backend-refactor
           make_latest: false
           draft: false
-          target_commitish: ${{ github.ref_name }}
+          target_commitish: ${{ github.sha }}
 
   audit-wheels:
     needs: build-wheels

From cc56a30e7d54e42328f0a995106828372acaebfe Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 23:10:12 +0000
Subject: [PATCH 197/234] reduce the pkg size + build times for the preview
 release

---
 .github/workflows/python-package.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index f96dd995e..6a2b3f63e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -58,6 +58,7 @@ jobs:
   # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
   ##
   build-shared-libs-cuda:
+    if: github.ref_name != 'multi-backend-refactor'
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest]
@@ -148,7 +149,7 @@ jobs:
   build-wheels:
     needs:
       - build-shared-libs
-      - build-shared-libs-cuda
+      # - build-shared-libs-cuda  reduce the pkg size + build times for the preview release
       - build-shared-libs-rocm
     strategy:
       matrix:

From 5225ebea79305af8e02bf9368aa282bc62f9b195 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 17:49:11 -0600
Subject: [PATCH 198/234] refine docs for multi-backend alpha release (#1380)

* refine docs for multi-backend alpha release

* docs: further tweaks to multi-backend alpha docs

* docs: further tweaks to multi-backend alpha docs

* docs: further tweaks to multi-backend alpha docs

* docs: add multi-backend feedback links

* docs: add request for contributions

* docs: small fixes

* docs: small fixes

* docs: add info about `main` continuous build

* docs: further tweaks to multi-backend alpha docs

* docs: further tweaks to multi-backend alpha docs
---
 docs/source/installation.mdx      | 224 ++++++++++++++++++++++++------
 docs/source/non_cuda_backends.mdx |   3 +
 2 files changed, 184 insertions(+), 43 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 2f82c199b..2ac56e03f 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,29 +1,45 @@
-# Installation
+# Installation Guide
 
-## CUDA
+Welcome to the installation guide for the `bitsandbytes` library! This document provides step-by-step instructions to install `bitsandbytes` across various platforms and hardware configurations. The library primarily supports CUDA-based GPUs, but the team is actively working on enabling support for additional backends like AMD ROCm, Intel, and Apple Silicon.
 
-bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's a multi-backend effort under way which is currently in alpha release, check [the respective section below in case you're interested to help us with early feedback](#multi-backend).
+> [!TIP]
+> For a high-level overview of backend support and compatibility, see the [Multi-backend Support](#multi-backend) section.
 
-The latest version of bitsandbytes builds on:
+## Table of Contents
 
-| OS | CUDA | Compiler |
-|---|---|---|
-| Linux | 11.7 - 12.3 | GCC 11.4 |
-|  | 12.4+ | GCC 13.2 |
-| Windows | 11.7 - 12.4 | MSVC 19.38+ (VS2022 17.8.0+) |
+- [CUDA](#cuda)
+  - [Installation via PyPI](#cuda-pip)
+  - [Compile from Source](#cuda-compile)
+- [Multi-backend Support (Alpha Release)](#multi-backend)
+  - [Supported Backends](#multi-backend-supported-backends)
+  - [Pre-requisites](#multi-backend-pre-requisites)
+  - [Installation](#multi-backend-pip)
+  - [Compile from Source](#multi-backend-compile)
+- [PyTorch CUDA Versions](#pytorch-cuda-versions)
 
-> [!TIP]
-> MacOS support is still a work in progress! Subscribe to this [issue](https://github.com/TimDettmers/bitsandbytes/issues/1020) to get notified about discussions and to track the integration progress.
+## CUDA[[cuda]]
 
-For Linux systems, make sure your hardware meets the following requirements to use bitsandbytes features.
+`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
 
-| **Feature** | **Hardware requirement** |
-|---|---|
-| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
-| 8-bit optimizers/quantization | NVIDIA Kepler (GTX 780 or newer) |
+### Supported CUDA Configurations[[cuda-pip]]
+
+The latest version of `bitsandbytes` builds on the following configurations:
+
+| **OS**      | **CUDA Version** | **Compiler**         |
+|-------------|------------------|----------------------|
+| **Linux**   | 11.7 - 12.3      | GCC 11.4             |
+|             | 12.4+            | GCC 13.2             |
+| **Windows** | 11.7 - 12.4      | MSVC 19.38+ (VS2022) |
+
+For Linux systems, ensure your hardware meets the following requirements:
+
+| **Feature**                     | **Hardware Requirement**                                           |
+|---------------------------------|--------------------------------------------------------------------|
+| LLM.int8()                      | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
+| 8-bit optimizers/quantization   | NVIDIA Kepler (GTX 780 or newer)                                    |
 
 > [!WARNING]
-> bitsandbytes >= 0.39.1 no longer includes Kepler binaries in pip installations. This requires manual compilation, and you should follow the general steps and use `cuda11x_nomatmul_kepler` for Kepler-targeted compilation.
+> `bitsandbytes >= 0.39.1` no longer includes Kepler binaries in pip installations. This requires [manual compilation using](#cuda-compile) the `cuda11x_nomatmul_kepler` configuration.
 
 To install from PyPI.
 
@@ -31,14 +47,41 @@ To install from PyPI.
 pip install bitsandbytes
 ```
 
-### Compile from source[[compile]]
+### `pip install` pre-built wheel from latest `main` commit
+
+If you would like to use new feature even before they are officially released and help us test them, feel free to install the wheel directly from our CI (*the wheel links will remain stable!*):
+
+<hfoptions id="OS">
+<hfoption id="Linux">
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-0.44.2.dev0-py3-none-manylinux_2_24_x86_64.whl'
+```
+
+</hfoption>
+<hfoption id="Windows">
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
+```
+</hfoption>
+</hfoptions>
+
+### Compile from source[[cuda-compile]]
+
+> [!TIP]
+> Don't hesitate to compile from source! The process is pretty straight forward and resilient. This might be needed for older CUDA versions or other less common configurations, which we don't support out of the box due to package size.
 
-For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations.
+For Linux and Windows systems, compiling from source allows you to customize the build configurations. See below for detailed platform-specific instructions (see the `CMakeLists.txt` if you want to check the specifics and explore some additional options):
 
 <hfoptions id="source">
 <hfoption id="Linux">
 
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
+
+For example, to install a compiler and CMake on Ubuntu:
 
 ```bash
 apt-get install -y build-essential cmake
@@ -48,16 +91,16 @@ You should also install CUDA Toolkit by following the [NVIDIA CUDA Installation
 
 Refer to the following table if you're using another CUDA Toolkit version.
 
-| CUDA Toolkit | GCC |
-|---|---|
-| >= 11.4.1 | >= 11 |
-| >= 12.0 | >= 12 |
-| >= 12.4 | >= 13 |
+| CUDA Toolkit |  GCC  |
+|--------------|-------|
+| >= 11.4.1    | >= 11 |
+| >= 12.0      | >= 12 |
+| >= 12.4      | >= 13 |
 
 Now to install the bitsandbytes package from source, run the following commands:
 
 ```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cuda -S .
 make
@@ -81,7 +124,7 @@ Refer to the following table if you're using another CUDA Toolkit version.
 | >= 11.6 | 19.30+ (VS2022) |
 
 ```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cuda -S .
 cmake --build . --config Release
@@ -93,7 +136,7 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com
 </hfoption>
 </hfoptions>
 
-### PyTorch CUDA versions
+### PyTorch CUDA versions[[pytorch-cuda-versions]]
 
 Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary.
 
@@ -105,7 +148,7 @@ Some bitsandbytes features may need a newer CUDA version than the one currently
 Then locally install the CUDA version you need with this script from bitsandbytes:
 
 ```bash
-wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
+wget https://raw.githubusercontent.com/bitsandbytes-foundation/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 #   CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125}
 #   EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
@@ -134,28 +177,62 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
 
 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
 
-## Multi-backend[[multi-backend]]
+## Multi-backend Support (Alpha Release)[[multi-backend]]
 
 > [!TIP]
-> This functionality is currently in preview and therefore not yet production-ready! Please reference [this guide](./non_cuda_backends) for more in-depth information about the different backends and their current status.
+> This functionality is currently in preview and not yet production-ready. We very much welcome community feedback, contributions and leadership on topics like Apple Silicon as well as other less common accellerators! For more information, see [this guide on multi-backend support](./non_cuda_backends).
+
+**Link to give us feedback** (bugs, install issues, perf results, requests, etc.)**:**
+
+<hfoptions id="platform">
+<hfoption id="ROCm">
+
+[**Multi-backend refactor: Alpha release (AMD ROCm ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1339)
+
+</hfoption>
+<hfoption id="Intel CPU+GPU">
+
+[**Multi-backend refactor: Alpha release (INTEL ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1338)
+
+</hfoption>
+<hfoption id="Apple Silicon / Metal (MPS)">
 
-Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
+[**Github Discussion space on coordinating the kickoff of MPS backend development**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340)
 
-### Pip install the pre-built wheel (recommended for most)
+</hfoption>
+</hfoptions>
 
-WIP (will be added in the coming days)
+### Supported Backends[[multi-backend-supported-backends]]
 
-### Compilation
+| **Backend** | **Supported Versions** | **Python versions** | **Architecture Support** | **Status** |
+|-------------|------------------------|---------------------------|-------------------------|------------|
+| **AMD ROCm** | 6.1+                   | 3.10+                     | minimum CDNA - `gfx90a`, RDNA - `gfx1100` | Alpha      |
+| **Apple Silicon (MPS)** | WIP                        | 3.10+                     | M1/M2 chips                    | Planned    |
+| **Intel CPU** | v2.4.0+ (`ipex`)         | 3.10+                     | Intel CPU | Alpha |
+| **Intel GPU** | v2.4.0+ (`ipex`)         | 3.10+                     | Intel GPU | Experimental |
+
+For each supported backend, follow the respective instructions below:
+
+### Pre-requisites[[multi-backend-pre-requisites]]
+
+To use bitsandbytes non-CUDA backends, be sure to install:
+
+```
+pip install "transformers>=4.45.1"
+```
 
 <hfoptions id="backend">
 <hfoption id="AMD ROCm">
 
-#### AMD GPU
-
-bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
+> [!WARNING]
+> Pre-compiled binaries are only built for ROCm versions `6.1.0`/`6.1.1`/`6.1.2`/`6.2.0` and `gfx90a`, `gfx942`, `gfx1100` GPU architectures. [Find the pip install instructions here](#multi-backend-pip).
+>
+> Other supported versions that don't come with pre-compiled binaries [can be compiled for with these instructions](#multi-backend-compile).
+>
+> **Windows is not supported for the ROCm backend**; also not WSL2 to our knowledge.
 
 > [!TIP]
-> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version.
+> If you would like to install ROCm and PyTorch on bare metal, skip the Docker steps and refer to ROCm's official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Special note: please make sure to get the respective ROCm-specific PyTorch wheel for the installed ROCm version, e.g. `https://download.pytorch.org/whl/nightly/rocm6.2/`!
 
 ```bash
 # Create a docker container with latest ROCm image, which includes ROCm libraries
@@ -165,9 +242,70 @@ apt-get update && apt-get install -y git && cd home
 
 # Install pytorch compatible with above ROCm version
 pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
+```
 
-# Install bitsandbytes from PyPI
-# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2/6.2.0 and gpu arch - gfx90a, gfx942, gfx1100
+</hfoption>
+<hfoption id="Intel CPU + GPU">
+
+Compatible hardware and functioning `import intel_extension_for_pytorch as ipex` capable environment with Python `3.10` as the minimum requirement.
+
+Please refer to [the official Intel installations instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
+
+</hfoption>
+<hfoption id="Apple Silicon (MPS)">
+
+> [!TIP]
+> Apple Silicon support is still a WIP. Please visit and write us in [this Github Discussion space on coordinating the kickoff of MPS backend development](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340) and coordinate a community-led effort to implement this backend.
+
+</hfoption>
+</hfoptions>
+
+### Installation
+
+You can install the pre-built wheels for each backend, or compile from source for custom configurations.
+
+#### Pre-built Wheel Installation (recommended)[[multi-backend-pip]]
+
+<hfoptions id="platform">
+<hfoption id="Linux">
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'
+```
+
+</hfoption>
+<hfoption id="Windows">
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl'
+```
+
+</hfoption>
+<hfoption id="Mac">
+
+> [!WARNING]
+> bitsandbytes does not yet support Apple Silicon / Metal with a dedicated backend. However, the build infrastructure is in place and the below pip install will eventually provide Apple Silicon support as it becomes available on the `multi-backend-refactor` branch based on community contributions.
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
+```
+
+</hfoption>
+</hfoptions>
+
+#### Compile from Source[[multi-backend-compile]]
+
+<hfoptions id="backend">
+<hfoption id="AMD ROCm">
+
+#### AMD GPU
+
+bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
+
+```bash
 # Please install from source if your configuration doesn't match with these)
 pip install bitsandbytes
 
@@ -195,10 +333,10 @@ pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise
 
 Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems.
 
-The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile).
+The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#cuda-compile).
 
 ```
-git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install intel_extension_for_pytorch
 pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cpu -S .
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index fc7c6ac27..728606b7b 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -1,5 +1,8 @@
 # Multi-backend support (non-CUDA backends)
 
+> [!Tip]
+> If you feel these docs need some additional info, please consider submitting a PR or respectfully request the missing info in one of the below mentioned Github discussion spaces.
+
 As part of a recent refactoring effort, we will soon offer official multi-backend support. Currently, this feature is available in a preview alpha release, allowing us to gather early feedback from users to improve the functionality and identify any bugs.
 
 At present, the Intel CPU and AMD ROCm backends are considered fully functional. The Intel XPU backend has limited functionality and is less mature.

From e6cc10934c72f1ddc99944331da6a95673a605d6 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 1 Oct 2024 14:01:09 +0000
Subject: [PATCH 199/234] docs: remove 2 obsolete lines

---
 docs/source/installation.mdx | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 2ac56e03f..609865436 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -306,9 +306,6 @@ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsan
 bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
 
 ```bash
-# Please install from source if your configuration doesn't match with these)
-pip install bitsandbytes
-
 # Install bitsandbytes from source
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
 git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/

From cd3cb6812dd4f2579f04ff51efa0662cc0467c63 Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Wed, 16 Oct 2024 15:51:32 -0500
Subject: [PATCH 200/234] Remove depth option in installation steps (#1395)

* Add build job for rocm

* Add rocm build script

* Copy shared obj file into output_dir

* upload build artifacts and enable wheels build

* Remove cuda build temporarily

* Add ROCm version to .so filename

* Add rocm_version to whls build

* Revert "Remove cuda build temporarily"

This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738.

* Add rocm_version env var

* Remove thrush header files

* Print node info

* print cuda node info

* Revert "print cuda node info"

This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0.

* Revert "Print node info"

This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022.

* Add rocm arch to compile command

* Rename .so files to rocm

* Update default gpu arch

* Skip cpu based igemmlt int tests on ROCm

* Update Documentation

* Update upstream repo name

* Update docs

* Update string format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove pre-release option for torch install

* Update pytorch install path

Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>

* Add messages for Heuristics error

* Remove toolcache for disk space

* print disk usage

* Clean disk space for linux

* Fix for ubuntu

* Add sudo for apt clean

* Update clean up disk list

* remove disk usage print

* Add BNB_BACKEND variable

* Update diagnostic functions for ROCm

* Fix tuple error

* Fix library detection bug for recursive and symlink cases

* fix pre-commit errors

* Remove recursive path lib search

* Create function for runtime lib patterns

* Update logger format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove commented code

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

* Create hip diagnostics functions

* Fix Typo

* Fix pre-commit checks

* Enable 6.2 build

* Skip gemv 4 bit cpu test

* Update documentation for 6.2.0 pip install

* Update README for default branch change

* Fix typo

* Sync README with upstream

* Remove depth

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Co-authored-by: Aswin John Mathews <81309834+amathews-amd@users.noreply.github.com>
Co-authored-by: root <root@banff-cyxtera-s78-4.ctr.dcgpu>
---
 docs/source/installation.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 609865436..d1acb2cd6 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -308,7 +308,7 @@ bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha releas
 ```bash
 # Install bitsandbytes from source
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
-git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
+git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 
 # Install dependencies
 pip install -r requirements-dev.txt

From cd73601fcb70f83f663b71c0169548facba3cd06 Mon Sep 17 00:00:00 2001
From: Huazhong Ji <hzji210@gmail.com>
Date: Wed, 20 Nov 2024 20:24:45 +0800
Subject: [PATCH 201/234] Fix issue that no valid semantic version tag found
 when installing bitsandbytes from source in personal repo (#1419)

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 2b1c1aff3..e8d3f547c 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,8 @@ def get_latest_semver_tag():
     tags = subprocess.check_output(["git", "tag"], text=True).splitlines()
     semver_tags = [tag for tag in tags if tag.count(".") == 2 and all(part.isdigit() for part in tag.split("."))]
     if not semver_tags:
-        raise ValueError("No valid semantic version tags found")
+        print("No valid semantic version tags found, use 0.0.1 defaultly")
+        semver_tags = ["0.0.1"]
     return sorted(semver_tags, key=lambda s: list(map(int, s.split("."))))[-1]
 
 

From b2ac4232999648bffb9c2a8b1a997ddd1029eadf Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Fri, 29 Nov 2024 22:48:17 +0800
Subject: [PATCH 202/234] Enable XPU and optimize cpu/xpu op (#1418)

* enable new ipex API

ipex weight is 4D so we cannot transpose

fix dequant

check require grad

* use ipex op in backward

* enable backward

* Multi backend refactor (#8)

* AMD: Clarify diagnostic messages; free up disk space for CI build

* Add build job for rocm

* Add rocm build script

* Copy shared obj file into output_dir

* upload build artifacts and enable wheels build

* Remove cuda build temporarily

* Add ROCm version to .so filename

* Add rocm_version to whls build

* Revert "Remove cuda build temporarily"

This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738.

* Add rocm_version env var

* Remove thrush header files

* Print node info

* print cuda node info

* Revert "print cuda node info"

This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0.

* Revert "Print node info"

This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022.

* Add rocm arch to compile command

* Rename .so files to rocm

* Update default gpu arch

* Skip cpu based igemmlt int tests on ROCm

* Update Documentation

* Update upstream repo name

* Update docs

* Update string format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove pre-release option for torch install

* Update pytorch install path

Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>

* Add messages for Heuristics error

* Remove toolcache for disk space

* print disk usage

* Clean disk space for linux

* Fix for ubuntu

* Add sudo for apt clean

* Update clean up disk list

* remove disk usage print

* Add BNB_BACKEND variable

* Update diagnostic functions for ROCm

* Fix tuple error

* Fix library detection bug for recursive and symlink cases

* fix pre-commit errors

* Remove recursive path lib search

* Create function for runtime lib patterns

* Update logger format

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Remove commented code

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update error reporting

* Create hip diagnostics functions

* Fix Typo

* Fix pre-commit checks

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>

* check grad before using ipex (#1358)

* Enable packaging for ROCm 6.2 (#1367)

* Enable 6.2 build

* Update documentation for 6.2.0 pip install

* Update for VS2022 17.11 compatibility with CUDA < 12.4 (#1341)

* Update for VS2022 17.11 compatibility with CUDA < 12.4

* Try again

* Enable continuous releases for multi-backend-refactor branch

* Update release workflow

* Publish continuous release for multi-backend

* continuous release: revert wheel renaming due to install err

* Revert "continuous release: revert wheel renaming due to install err"

This reverts commit 0a2b5392ff079645fdc9ff887f80d327f9e874f7.

* add dynamic tag-based versioning + git hash for dev vers

* docs: update w/ changes from `main`

* get tags for dynamic versioning

* fine-tune continuous release params

* reduce the pkg size + build times for the preview release

* refine docs for multi-backend alpha release (#1380)

* refine docs for multi-backend alpha release

* docs: further tweaks to multi-backend alpha docs

* docs: further tweaks to multi-backend alpha docs

* docs: further tweaks to multi-backend alpha docs

* docs: add multi-backend feedback links

* docs: add request for contributions

* docs: small fixes

* docs: small fixes

* docs: add info about `main` continuous build

* docs: further tweaks to multi-backend alpha docs

* docs: further tweaks to multi-backend alpha docs

* docs: remove 2 obsolete lines

---------

Co-authored-by: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Co-authored-by: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>

* Revert "enable backward"

This reverts commit cd7bf2145807932c8a8a499ddb6bb14e47eb24fc.

* Revert "use ipex op in backward"

This reverts commit b8df1aad9414a669e188678b36be304400987a72.

* fix finetune

* check training

* fix gemv check

* reformat

* avoid double quant in backward if not needed

* Zh/xpu support (#9)

* Add xpu support

* Add xpu support for int8

* Add xpu dequant kernel support

* update code

* remove debug comments

* remove redundant comments

* Add xpu integration for woqlinear

* correct the comments

* Update cpu_xpu_common.py

---------

Co-authored-by: zhuhong61 <hong.zhu@intel.com>
Co-authored-by: zhuhong61 <95205772+zhuhong61@users.noreply.github.com>

* avoid import triton if CPU and XPU backend

* fix setup in docker without git config

* xpu do not support compile for now

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* update xpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* update 4bit compute dtype

* fix xpu int8 path

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* optimize 4bit dequant

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix xpu dequant

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* add empty cache in each xpu op

* add nf4 dequant ipex kernel

* fix dequant 4bit op

* empty cache has negative effect on 4bit gemv

* fix xpu save

* fix save

* xpu use float16 default

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* rm empty cache as it cause slower perf

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix xpu save

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix 8bit int8 param device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix 8bit int8 param device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix 8bit int8 param device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix 8bit int8 param device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix format

* update readme for Intel CPU and XPU do not need make csrc codes

* fix format

* fix import

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
Co-authored-by: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Co-authored-by: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Co-authored-by: zhuhong61 <hong.zhu@intel.com>
Co-authored-by: zhuhong61 <95205772+zhuhong61@users.noreply.github.com>
---
 bitsandbytes/__init__.py                |  8 ++-
 bitsandbytes/autograd/_functions.py     | 17 +++--
 bitsandbytes/backends/cpu_xpu_common.py | 70 ++++++++++--------
 bitsandbytes/backends/xpu.py            | 95 ++++++++++++++++++++++---
 bitsandbytes/functional.py              | 22 +++---
 bitsandbytes/nn/__init__.py             | 16 +++--
 bitsandbytes/nn/modules.py              | 66 ++++++++++++-----
 bitsandbytes/utils.py                   | 41 +++++++----
 docs/source/installation.mdx            |  6 +-
 docs/source/non_cuda_backends.mdx       |  6 +-
 10 files changed, 246 insertions(+), 101 deletions(-)
 mode change 100644 => 100755 bitsandbytes/nn/modules.py

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 25ec8a79a..c705137c0 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -17,11 +17,10 @@
     matmul_cublas,
     mm_cublas,
 )
-from .backends import register_backend
+from .backends import backends, register_backend
 from .backends.cpu import CPUBackend
 from .backends.npu import NPUBackend
 from .cextension import lib
-from .nn import modules
 
 features = {"multi_backend"}
 supported_torch_devices = {
@@ -64,6 +63,11 @@
 if hasattr(torch, "npu") and torch.npu.is_available():
     register_backend("npu", NPUBackend())
 
+
+# import module after decided backends
+if backends:
+    from .nn import modules
+
 # TODO: Other potential backends:
 # XLA - Google TPU / PJRT runtime
 # HPU - Habana / Intel Gaudi
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 59e26ad09..9765def05 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -221,7 +221,7 @@ def backward(ctx, grad_output):
 
 def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""
-    if device == torch.device("cpu"):
+    if device == torch.device("cpu") or torch.device("xpu"):
         return True
     if torch.version.hip:
         return False if BNB_HIP_VERSION < 601 else True
@@ -463,7 +463,9 @@ def backward(ctx, grad_output):
         if len(grad_output.shape) == 3:
             grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
 
-        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
+        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = None, None, None, None, None
+        if req_gradB or (req_gradA and state.CBt):
+            Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
         if req_gradB:
             CxAt, SAt = F.transform(CAt, formatB, transpose=True)
             C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)
@@ -575,8 +577,15 @@ def matmul_4bit(
     bias=None,
 ):
     assert quant_state is not None
-    if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False:
-        # CPU backend does not require A to be a vector
+    if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
+        if getattr(quant_state, "ipex", False):
+            out = F.gemv_4bit(A, B.t(), out, state=quant_state)
+            if bias is not None:
+                out += bias
+            return out
+        else:
+            return MatMul4Bit.apply(A, B, out, bias, quant_state)
+    elif A.numel() == A.shape[-1] and A.requires_grad == False:
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(
                 f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 0d865b541..d2e0c2593 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -15,6 +15,7 @@
 
     ipex_cpu = ipex if ipex._C._has_cpu() else None
     ipex_xpu = ipex if ipex._C._has_xpu() else None
+    ipex_cpu_only = ipex._C._has_cpu() and (not ipex._C._has_xpu())
 except BaseException:
     ipex_cpu = None
     ipex_xpu = None
@@ -55,7 +56,7 @@ def _ipex_xpu_version_prereq(major, minor):
 
 def _maybe_torch_compile(func):
     # torch.compile requires g++ and pytorch >= 2.0
-    if gxx_available and _torch_version_prereq(2, 0):
+    if gxx_available and _torch_version_prereq(2, 0) and not ipex_xpu:
         options = {}
         # fx_graph_cache requires pytorch >= 2.2
         if _torch_version_prereq(2, 2):
@@ -181,7 +182,7 @@ def igemmlt_impl(A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32)
     A_reshaped = A.reshape(m, k)
 
     # torch._int_mm is available on CPU since torch 2.4
-    if _torch_version_prereq(2, 4):
+    if _torch_version_prereq(2, 4) and A.device.type == "cpu":
         C = torch._int_mm(A_reshaped, B.T).to(dtype)
     else:
         C = torch.matmul(A_reshaped.float(), B.t().float()).to(dtype)
@@ -233,8 +234,10 @@ def mm_dequant_impl(
         out_shape = (out_shape[0] * out_shape[1], out_shape[2])
 
     if compute_dtype not in [torch.float32, torch.bfloat16]:
-        warnings.warn(f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use float instead")
-        compute_dtype = torch.float32
+        warnings.warn(
+            f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use bfloat16 instead"
+        )
+        compute_dtype = torch.bfloat16
     A_reshaped = A.reshape(out_shape).to(compute_dtype)
     row_stats = row_stats.reshape(-1).unsqueeze(-1).to(compute_dtype)
     col_stats = col_stats.reshape(-1).unsqueeze(0).to(compute_dtype)
@@ -342,7 +345,7 @@ def quantize_4bit_impl(
         scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1)
         scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
     # map [-1, 1] to nf4/fp4
-    out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8)
+    out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8, device=A.device)
     if quant_type == "nf4":
         for i in range(len(NF4_QUANT_TABLE)):
             out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i
@@ -408,7 +411,6 @@ def dequantize_4bit_impl(
     torch.Tensor:
         Dequantized tensor.
     """
-
     if A.shape[0] == 1:
         transpose = False
         A = A.squeeze(0)
@@ -438,23 +440,18 @@ def dequantize_4bit_impl(
     if quant_state.nested:
         raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
 
-    if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(quant_state, "op_context"):
-        assert quant_state.op_context is not None
-        A = quant_state.op_context.to_public(quant_state.op_context.get_weight())
-        A = A.reshape(-1)
-        absmax = quant_state.op_context.get_scales().reshape(-1)
-
-    if out is None:
-        out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
+    if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5) and getattr(quant_state, "ipex", False):
+        A = torch.ops.ipex_prepack.woq_linear_unpack_weight(A, "nf4", quant_state.shape, 2)
+        quant_state.ipex = False
 
-    n = out.numel()
     # Map nf4 to [-1, 1]
-    out_uint8 = torch.empty(A.size(0) * 2, dtype=torch.uint8, device=A.device)
-    out_uint8[::2] = A.bitwise_and(0xF)
-    out_uint8[1::2] = A.bitwise_right_shift(4)
-    out_dq = torch.empty(out_uint8.shape).to(quant_state.dtype)
-    for i in range(len(quant_state.code)):
-        out_dq[out_uint8 == i] = quant_state.code[i]
+    out_dq = torch.empty(A.size(0) * 2, dtype=torch.int32, device=A.device)
+    n = out_dq.numel()
+    out_dq[::2] = A & 0xF
+    out_dq[1::2] = A >> 4
+    # quant_state.code is fp32, cast to quant_state dtype to avoid the mismatch issue
+    quant_state.code = quant_state.code.to(quant_state.dtype)
+    out_dq = quant_state.code[out_dq]
 
     # Apply scales
     if out_dq.numel() != n:
@@ -464,12 +461,17 @@ def dequantize_4bit_impl(
     blocks += 1 if n % blocksize > 0 else 0
     rem = n % blocksize
     has_rem = rem > 0
-    out_reshaped = out.reshape(-1)
-    out_reshaped[: n - rem] = (out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)).reshape(
-        -1
-    )
+
     if has_rem:
+        if out is None:
+            out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
+        out_reshaped = out.reshape(-1)
+        out_reshaped[: n - rem] = (
+            out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)
+        ).reshape(-1)
         out_reshaped[n - rem :] = out_dq[n - rem :] * absmax[-1]
+    else:
+        out = (out_dq.view(-1, blocksize) * absmax.view(-1, 1)).reshape(quant_state.shape).to(quant_state.dtype)
 
     # take transpose here because weight is transposed (again) for computation
     if transpose:
@@ -510,9 +512,21 @@ def gemm_4bit_impl(
     torch.Tensor:
         GEMM output tensor.
     """
-    if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(state, "op_context"):
-        assert state.op_context is not None
-        output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle())
+    if getattr(state, "ipex", False):
+        output = torch.ops.torch_ipex.woq_linear(
+            A,
+            B,
+            "nf4",
+            state.shape,
+            state.new_scales,
+            state.new_zeros,
+            None,
+            None,
+            state.blocksize,
+            ipex_cpu.quantization.WoqLowpMode.BF16,
+            1,
+            state.compensation,
+        )
     else:
         dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize).t()
         output = torch.matmul(A, dqB.to(A.dtype))
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index 3976c4d5a..bc13963e6 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -5,9 +5,36 @@
 from bitsandbytes.utils import QuantState
 
 from .base import Backend
+from .cpu_xpu_common import (
+    dequantize_4bit_impl,
+    double_quant_impl,
+    gemm_4bit_impl,
+    igemmlt_impl,
+    mm_dequant_impl,
+    quantize_4bit_impl,
+)
+
+Tensor = torch.Tensor
+
+
+def assert_on_xpu(tensors):
+    on_xpu = True
+    for t in tensors:
+        if t is None:
+            continue  # NULL pointers are fine
+        on_xpu &= t.device.type == "xpu"
+    if not on_xpu:
+        raise TypeError(
+            "All input tensors need to be on XPU, but found some tensors to not be on XPU:\n"
+            f" {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}"
+        )
+    return on_xpu
 
 
 class XPUBackend(Backend):
+    mm_dequant_compute_dtype = torch.bfloat16
+    mm_dequant_output_dtype = torch.bfloat16
+
     def double_quant(
         self,
         A: torch.Tensor,
@@ -17,7 +44,9 @@ def double_quant(
         out_row: Optional[torch.Tensor] = None,
         threshold=0.0,
     ):
-        raise NotImplementedError
+        assert_on_xpu([A, col_stats, row_stats, out_col, out_row])
+        output = double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
+        return output
 
     def transform(
         self,
@@ -29,7 +58,23 @@ def transform(
         state: Optional[Tuple[torch.Size, str]] = None,
         ld=None,
     ):
-        raise NotImplementedError
+        """
+        Transform tensor A to to_order. It is originally designed for CUDA.
+        For XPU, it returns the original tensor if transpose=False.
+        Otherwise, it returns the transpose of A
+        """
+        assert_on_xpu([A, out])
+        if transpose:
+            if out is not None:
+                out.copy_(A.T)
+            else:
+                out = A.T
+        else:
+            if out is not None:
+                out.copy_(A)
+            else:
+                out = A
+        return out, state
 
     def igemmlt(
         self,
@@ -41,7 +86,9 @@ def igemmlt(
         Sout: Optional[Tuple[torch.Size, str]] = None,
         dtype=torch.int32,
     ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
-        raise NotImplementedError
+        assert_on_xpu([A, B])
+        output = igemmlt_impl(A, B, SA, SB, out, Sout, dtype)
+        return output
 
     def mm_dequant(
         self,
@@ -54,7 +101,20 @@ def mm_dequant(
         new_col_stats: Optional[torch.Tensor] = None,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        raise NotImplementedError
+        assert_on_xpu([A, row_stats, col_stats, out, bias])
+        output = mm_dequant_impl(
+            A,
+            quant_state,
+            row_stats,
+            col_stats,
+            out,
+            new_row_stats,
+            new_col_stats,
+            bias,
+            self.mm_dequant_compute_dtype,
+            self.mm_dequant_output_dtype,
+        )
+        return output
 
     def extract_outliers(
         self,
@@ -62,7 +122,9 @@ def extract_outliers(
         SA: Tuple[torch.Size, str],
         idx: torch.Tensor,
     ) -> torch.Tensor:
-        raise NotImplementedError
+        assert_on_xpu([A])
+        output = A[:, idx].contiguous()
+        return output
 
     def quantize_4bit(
         self,
@@ -74,7 +136,12 @@ def quantize_4bit(
         quant_type: Literal["fp4", "nf4"] = "fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
-        raise NotImplementedError
+        if blocksize is None:
+            blocksize = 64
+        assert_on_xpu([A, absmax, out])
+        assert quant_storage == torch.uint8, "XPU backend only supports uint8 quant_storage"
+        output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        return output
 
     def dequantize_4bit(
         self,
@@ -85,7 +152,15 @@ def dequantize_4bit(
         blocksize: int = 64,
         quant_type: Literal["fp4", "nf4"] = "fp4",
     ) -> torch.Tensor:
-        raise NotImplementedError
+        if blocksize is None:
+            blocksize = 64
+        assert_on_xpu([A, absmax, out])
+        if quant_type == "nf4":
+            output = torch.ops.torch_ipex.dequantize_4bit(A, "nf4", quant_state.shape, absmax, None, blocksize).t()
+        else:
+            output = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
+
+        return output
 
     def gemv_4bit(
         self,
@@ -96,7 +171,11 @@ def gemv_4bit(
         transposed_B=False,
         state: QuantState = None,
     ) -> torch.Tensor:
-        raise NotImplementedError
+        assert_on_xpu([A, B, out])
+        if state is None:
+            raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()")
+        output = gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state)
+        return output
 
     def dequantize_blockwise(
         self,
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 6cf64df28..3c730cb16 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1006,11 +1006,6 @@ def dequantize_fp4(
     out: Optional[torch.Tensor] = None,
     blocksize: Optional[int] = None,
 ) -> Tensor:
-    if blocksize is None:
-        # Some AMD GPUs have warpsize 64
-        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
-        blocksize = 64 if not HIP_ENVIRONMENT else 128
-
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
 
 
@@ -1021,11 +1016,6 @@ def dequantize_nf4(
     out: Optional[torch.Tensor] = None,
     blocksize: Optional[int] = None,
 ) -> Tensor:
-    if blocksize is None:
-        # Some AMD GPUs have warpsize 64
-        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
-        blocksize = 64 if not HIP_ENVIRONMENT else 128
-
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
 
 
@@ -1035,7 +1025,7 @@ def dequantize_4bit(
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
     blocksize: Optional[int] = None,
-    quant_type="fp4",
+    quant_type=None,
 ) -> Tensor:
     """
     Dequantizes FP4 blockwise quantized values.
@@ -1064,6 +1054,14 @@ def dequantize_4bit(
         Dequantized tensor.
     """
     ensure_backend_is_available(A.device.type)
+    if quant_state is not None:
+        absmax = absmax or quant_state.absmax
+        quant_type = quant_type or quant_state.quant_type
+        blocksize = blocksize or quant_state.blocksize
+    if blocksize is None:
+        # Some AMD GPUs have warpsize 64
+        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
+        blocksize = 64 if not HIP_ENVIRONMENT else 128
     return backends[A.device.type].dequantize_4bit(
         A, quant_state=quant_state, absmax=absmax, out=out, blocksize=blocksize, quant_type=quant_type
     )
@@ -1800,7 +1798,7 @@ class COOSparseTensor:
     def __init__(self, rows, cols, nnz, rowidx, colidx, values):
         assert rowidx.dtype == torch.int32
         assert colidx.dtype == torch.int32
-        if values.device == torch.device("cpu"):
+        if values.device == torch.device("cpu") or torch.device("xpu"):
             assert values.dtype in [torch.bfloat16, torch.half, torch.float]
         else:
             assert values.dtype == torch.float16
diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py
index 96f4359bf..35bee393e 100644
--- a/bitsandbytes/nn/__init__.py
+++ b/bitsandbytes/nn/__init__.py
@@ -2,6 +2,7 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+from ..backends import backends
 from .modules import (
     Embedding,
     Int8Params,
@@ -14,9 +15,12 @@
     StableEmbedding,
     SwitchBackLinearBnb,
 )
-from .triton_based_modules import (
-    StandardLinear,
-    SwitchBackLinear,
-    SwitchBackLinearGlobal,
-    SwitchBackLinearVectorwise,
-)
+
+# CPU and XPU backend do not need triton, and XPU so not support triton for now.
+if "xpu" not in backends.keys() and len(backends.keys()) > 1:
+    from .triton_based_modules import (
+        StandardLinear,
+        SwitchBackLinear,
+        SwitchBackLinearGlobal,
+        SwitchBackLinearVectorwise,
+    )
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
old mode 100644
new mode 100755
index 32854413f..2159c21e4
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -314,6 +314,9 @@ def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: b
     def cpu(self, non_blocking: bool = False):
         return self.to(device="cpu", non_blocking=non_blocking)
 
+    def xpu(self, non_blocking: bool = False):
+        return self.to(device="xpu", non_blocking=non_blocking)
+
     @overload
     def to(
         self: T,
@@ -331,7 +334,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if device is not None and device.type in ["cuda", "cpu"] and not self.bnb_quantized:
+        if device is not None and device.type in ["cuda", "cpu", "xpu"] and not self.bnb_quantized:
             return self._quantize(device)
         else:
             if self.quant_state is not None:
@@ -417,6 +420,7 @@ def __init__(
         # self.persistent_buffers = []  # TODO consider as way to save quant state
         self.compute_dtype = compute_dtype
         self.compute_type_is_set = False
+        self.ipex_linear_is_set = False
         self.quant_state = None
         self.quant_storage = quant_storage
 
@@ -445,35 +449,39 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
         save weight and bias,
         then fill state_dict with components of quant_state
         """
-        if (
-            getattr(self.weight, "quant_state", None) is not None
-            and getattr(self.weight.quant_state, "op_context", None) is not None
-        ):
-            context = self.weight.quant_state.op_context
-            self.weight.data = context.to_public(context.get_weight()).reshape([1, -1])
+        if getattr(self.weight, "quant_state", None) is not None and getattr(self.weight.quant_state, "ipex", False):
+            if self.weight.device.type == "cpu":
+                original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
+                    self.weight, "nf4", self.weight.quant_state.shape, 2
+                )
+                self.weight.data = original_weight.data
+            elif self.weight.device.type == "xpu":
+                self.weight.data = self.weight.data.reshape(1, -1)
+
+            self.weight.quant_state.ipex = False
 
         super()._save_to_state_dict(destination, prefix, keep_vars)  # saving weight and bias
 
         if getattr(self.weight, "quant_state", None) is not None:
-            if (
-                self.weight.quant_state.absmax.shape.numel() == 0
-                and getattr(self.weight.quant_state, "op_context", None) is not None
-            ):
-                self.weight.quant_state.absmax = context.get_scales().reshape(-1)
-                delattr(self.weight.quant_state, "op_context")
             for k, v in self.weight.quant_state.as_dict(packed=True).items():
                 destination[prefix + "weight." + k] = v if keep_vars else v.detach()
 
-    def forward(self, x: torch.Tensor):
-        # Check if ipex fusion can be used
+    def set_ipex_linear(self, x: torch.Tensor):
         if (
-            x.device.type == "cpu"
-            and not hasattr(self.weight.quant_state, "op_context")
+            (x.device.type in ("cpu", "xpu"))
+            and not getattr(self.weight.quant_state, "ipex", False)
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
+            and not self.training
             and x.requires_grad == False
         ):
-            enable_ipex_fusion(self.weight, self.weight.quant_state)
+            enable_ipex_fusion(self)
+
+    def forward(self, x: torch.Tensor):
+        # Check if ipex fusion can be used
+        if not self.ipex_linear_is_set:
+            self.set_ipex_linear(x)
+            self.ipex_linear_is_set = True
 
         # weights are cast automatically as Int8Params, but the bias has to be cast manually
         if self.bias is not None and self.bias.dtype != x.dtype:
@@ -633,7 +641,20 @@ def __deepcopy__(self, memo):
 
     def cpu(self):
         # we store the 8-bit rows-major weight
-        B = self.data.contiguous().bfloat16().cpu()
+        B = self.data.contiguous().to(torch.bfloat16).cpu()
+        CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
+        if CBt is not None:
+            del CBt
+        if SCBt is not None:
+            del SCBt
+        self.data = CB
+        self.CB = CB
+        self.SCB = SCB
+        return self
+
+    def xpu(self):
+        # we store the 8-bit rows-major weight
+        B = self.data.contiguous().to(torch.float16).xpu()
         CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
         if CBt is not None:
             del CBt
@@ -669,6 +690,13 @@ def to(self, *args, **kwargs):
                 return self
             else:
                 return self.cpu()
+        elif device.type == "xpu":
+            if self.data.dtype == torch.int8:
+                self.data = self.data.contiguous().xpu()
+                self.CB = self.data
+                return self
+            else:
+                return self.xpu()
         else:
             new_param = Int8Params(
                 super().to(device=device, dtype=dtype, non_blocking=non_blocking),
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 9e52c915d..adb36279c 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,28 +200,39 @@ def unpack_tensor_to_dict(tensor_data):
     return unpacked_dict
 
 
-def enable_ipex_fusion(weight, quant_state):
-    from bitsandbytes.backends.cpu_xpu_common import _ipex_cpu_version_prereq
-
-    if _ipex_cpu_version_prereq(2, 3):
-        import intel_extension_for_pytorch as ipex
-
-        lowp_mode = ipex.quantization.WoqLowpMode.BF16
-        quant_state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
-            weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
-            ipex.quantization.WoqWeightDtype.NF4,
+def enable_ipex_fusion(linear):
+    from bitsandbytes.backends.cpu_xpu_common import (
+        _ipex_cpu_version_prereq,
+        _ipex_xpu_version_prereq,
+        ipex_cpu_only,
+        ipex_xpu,
+    )
+
+    if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5):
+        quant_state = linear.weight.quant_state
+        new_weight, new_scales, new_zeros, _, compensation = torch.ops.ipex_prepack.woq_linear_pack_weight(
+            linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+            "nf4",
             quant_state.shape,  # weight shape
             quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
             None,  # zero_points
             None,  # bias
-            None,  # g_idx
             None,  # batch_size
             quant_state.blocksize,
-            int(lowp_mode),
-            -1,  # act_quant_mode. -1 means don't quant activation
+            2,
         )
-        quant_state.absmax = torch.Tensor()
-        weight.data = torch.empty([1, 0], dtype=torch.uint8)
+    elif ipex_xpu and _ipex_xpu_version_prereq(2, 5):
+        quant_state = linear.weight.quant_state
+        new_weight = linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
+
+        new_scales = quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
+        new_zeros = None
+        compensation = None
+    linear.weight.data = new_weight.data
+    linear.weight.quant_state.ipex = True
+    linear.weight.quant_state.new_scales = new_scales
+    linear.weight.quant_state.new_zeros = new_zeros
+    linear.weight.quant_state.compensation = compensation
 
 
 class QuantState:
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index d1acb2cd6..615dfd95e 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -208,8 +208,8 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
 |-------------|------------------------|---------------------------|-------------------------|------------|
 | **AMD ROCm** | 6.1+                   | 3.10+                     | minimum CDNA - `gfx90a`, RDNA - `gfx1100` | Alpha      |
 | **Apple Silicon (MPS)** | WIP                        | 3.10+                     | M1/M2 chips                    | Planned    |
-| **Intel CPU** | v2.4.0+ (`ipex`)         | 3.10+                     | Intel CPU | Alpha |
-| **Intel GPU** | v2.4.0+ (`ipex`)         | 3.10+                     | Intel GPU | Experimental |
+| **Intel CPU** | v2.5.0+ (`ipex`)         | 3.10+                     | Intel CPU | Alpha |
+| **Intel GPU** | v2.5.0+ (`ipex`)         | 3.10+                     | Intel GPU | Experimental |
 
 For each supported backend, follow the respective instructions below:
 
@@ -336,8 +336,6 @@ The below commands are for Linux. For installing on Windows, please adapt the be
 git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 pip install intel_extension_for_pytorch
 pip install -r requirements-dev.txt
-cmake -DCOMPUTE_BACKEND=cpu -S .
-make
 pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise leave that out)
 ```
 
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index 728606b7b..4c429fb2d 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -33,12 +33,12 @@ The following performance data is collected from Intel 4th Gen Xeon (SPR) platfo
 
 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.6x | 2.3x | 0.03x |
+| Speed-Up (vs BF16) | 1.0x | 0.44x | 1.8x | 0.1x |
 | Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
 
 #### Fine-Tuning (CPU)
 
-| Data Type | AMP BF16 | INT8 | NF4 | FP4 |
+| Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs AMP BF16) | 1.0x | 0.38x | 0.07x | 0.07x |
+| Speed-Up (vs BF16) | 1.0x | 0.38x | 0.1x | 0.1x |
 | Memory (GB) | 40 | 9 | 6.6 | 6.6 |

From 931569217fba9423dc176cf2956b96c625a96d3c Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 2 Dec 2024 16:55:21 +0800
Subject: [PATCH 203/234] fix cpu nf4 (#1432)

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/autograd/_functions.py | 3 ++-
 bitsandbytes/nn/modules.py          | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 9765def05..e188479f6 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -579,7 +579,8 @@ def matmul_4bit(
     assert quant_state is not None
     if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
         if getattr(quant_state, "ipex", False):
-            out = F.gemv_4bit(A, B.t(), out, state=quant_state)
+            B = B.t() if len(B.shape) == 2 else B
+            out = F.gemv_4bit(A, B, out, state=quant_state)
             if bias is not None:
                 out += bias
             return out
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 2159c21e4..66f14edf7 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -508,7 +508,8 @@ def forward(self, x: torch.Tensor):
             x = x.to(self.compute_dtype)
 
         bias = None if self.bias is None else self.bias.to(self.compute_dtype)
-        out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
+        weight = self.weight.t() if len(self.weight.shape) == 2 else self.weight
+        out = bnb.matmul_4bit(x, weight, bias=bias, quant_state=self.weight.quant_state)
 
         out = out.to(inp_dtype)
 

From 994833378a51a96db6a74ee8071654def47007b2 Mon Sep 17 00:00:00 2001
From: Huazhong Ji <hzji210@gmail.com>
Date: Fri, 6 Dec 2024 22:45:55 +0800
Subject: [PATCH 204/234] Add Ascend NPU support for nf4 quant (#1422)

* Add npu support for nf4 quant

Co-authored-by: Slightwind <slightwindsec@gmail.com>
Co-authored-by: Ginray <ginray0215@gmail.com>

* code format

* update

* pass lint check and fix typos

* add npu to supported devices

---------

Co-authored-by: Slightwind <slightwindsec@gmail.com>
Co-authored-by: Ginray <ginray0215@gmail.com>
---
 CMakeLists.txt                          |  49 +++++-
 _typos.toml                             |   3 +
 bitsandbytes/__init__.py                |   1 +
 bitsandbytes/autograd/_functions.py     |  14 +-
 bitsandbytes/backends/cpu_xpu_common.py |   2 +-
 bitsandbytes/backends/npu.py            | 152 ++++++++++++++--
 bitsandbytes/cextension.py              |   5 +
 bitsandbytes/nn/modules.py              |  10 +-
 bitsandbytes/npu_specs.py               |  20 +++
 csrc/npu_kernels.cpp                    | 222 ++++++++++++++++++++++++
 csrc/npu_ops.cpp                        |  51 ++++++
 csrc/npu_ops.h                          |  28 +++
 csrc/pythonInterface.cpp                |  11 ++
 docs/source/installation.mdx            |  33 ++++
 14 files changed, 581 insertions(+), 20 deletions(-)
 create mode 100644 bitsandbytes/npu_specs.py
 create mode 100644 csrc/npu_kernels.cpp
 create mode 100644 csrc/npu_ops.cpp
 create mode 100644 csrc/npu_ops.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 315e0ff1b..20dd2b45d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@
 #   For  GCC: `cmake -B build . && cmake --build build`
 #   For MSVC: `cmake -B build . && cmake --build build --config Release`
 # You can also use the following options and variables
-#  - COMPUTE_BACKEND: Set to `cpu`, `cuda`, `hip` or `mps` to select the backend
+#  - COMPUTE_BACKEND: Set to `cpu`, `cuda`, `hip`, `mps` or `npu` to select the backend
 #  - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
 #  - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
 #                  is whatever CMake finds on your path.
@@ -29,11 +29,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
 set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
 set(MPS_FILES csrc/mps_ops.mm)
 set(METAL_FILES csrc/mps_kernels.metal)
+set(NPU_FILES csrc/npu_ops.cpp)
 # C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})
 
-set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
-set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, npu)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps npu)
 option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
 
 if(APPLE)
@@ -69,6 +70,11 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
     set(BUILD_MPS ON)
+elseif(${COMPUTE_BACKEND} STREQUAL "npu")
+    set(BUILD_CUDA OFF)
+    set(BUILD_HIP OFF)
+    set(BUILD_MPS OFF)
+    set(BUILD_NPU ON)
 else()
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
@@ -232,6 +238,33 @@ elseif(BUILD_MPS)
                 COMMENT "Compiling Metal kernels"
                 VERBATIM)
     add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
+elseif(BUILD_NPU)
+    list(APPEND SRC_FILES ${NPU_FILES})
+
+    set(SOC_VERSION "Ascend910B4" CACHE STRING "system on chip type")
+    set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME_PATH} CACHE
+        STRING "ASCEND CAN package installation directory"
+    )
+
+    # ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}.
+    # ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library
+    # file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/csrc/npu_kernels.cpp)
+    file(GLOB KERNEL_FILES csrc/npu_kernels.cpp)
+
+    if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+        set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+        set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+    else()
+        message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the can package is installed")
+    endif()
+    include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+    # ascendc_library use to add kernel file to generate ascendc library
+    ascendc_library(ascendc_kernels_npu STATIC ${KERNEL_FILES})
+
+    string(APPEND BNB_OUTPUT_NAME "_npu")
+    add_compile_definitions(BUILD_NPU)
 else()
     string(APPEND BNB_OUTPUT_NAME "_cpu")
     set(GPU_SOURCES)
@@ -249,7 +282,11 @@ endif()
 
 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
 add_library(bitsandbytes SHARED ${SRC_FILES})
-target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+if(BUILD_NPU)
+    target_compile_features(bitsandbytes PUBLIC cxx_std_17)
+else()
+    target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+endif()
 target_include_directories(bitsandbytes PUBLIC csrc include)
 
 
@@ -306,6 +343,10 @@ if(BUILD_MPS)
     add_dependencies(bitsandbytes metallib)
     target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
 endif()
+if(BUILD_NPU)
+    target_compile_options(bitsandbytes PRIVATE -O2 -std=c++17)
+    target_link_libraries(bitsandbytes PRIVATE $<BUILD_INTERFACE:host_intf_pub> ascendc_kernels_npu)
+endif()
 
 if(WIN32)
     set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
diff --git a/_typos.toml b/_typos.toml
index e4e7287fb..ff4c9ae06 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -3,12 +3,15 @@
 [default]
 extend-ignore-re = [
     "@Ther-nul",  # valid Github user
+    "CANN",  # CANN (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for Ascend NPU
 ]
 
 [default.extend-identifiers]
 
 [type.py.extend-words]
 "BA" = "BA"  # used as a commented-out variable in tests
+"cann" = "cann" # cann (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for Ascend NPU
+
 
 [type.cuda.extend-words]
 "subtile" = "subtile"
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index c705137c0..f850140a1 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -25,6 +25,7 @@
 features = {"multi_backend"}
 supported_torch_devices = {
     "cuda",  # includes ROCm
+    "npu",  # Ascend NPU
     "xpu",  # Intel GPU
     "cpu",
 }
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index e188479f6..6440ab1b5 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -519,7 +519,12 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]
 
         # 1. Dequantize
         # 2. MatmulnN
-        output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
+        if A.device.type == "npu":
+            output = torch.matmul(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t())
+            if bias is not None:
+                output += bias
+        else:
+            output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
 
         # 3. Save state
         ctx.state = quant_state
@@ -550,7 +555,10 @@ def backward(ctx, grad_output):
         # not supported by PyTorch. TODO: create work-around
         # if req_gradB: grad_B = torch.matmul(grad_output.t(), A)
         if req_gradA:
-            grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
+            if grad_output.device.type == "npu":
+                grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype))
+            else:
+                grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
 
         return grad_A, grad_B, None, grad_bias, None
 
@@ -586,7 +594,7 @@ def matmul_4bit(
             return out
         else:
             return MatMul4Bit.apply(A, B, out, bias, quant_state)
-    elif A.numel() == A.shape[-1] and A.requires_grad == False:
+    elif A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "npu":
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(
                 f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index d2e0c2593..8fdf7569d 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -23,7 +23,7 @@
 
 gxx_available = False
 try:
-    subprocess.run(["g++", "--version"])
+    subprocess.run(["g++", "--version"], capture_output=True)  # hide terminal output
     gxx_available = True
 except BaseException:
     warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.")
diff --git a/bitsandbytes/backends/npu.py b/bitsandbytes/backends/npu.py
index 1b3cb57d6..ecbc2f351 100644
--- a/bitsandbytes/backends/npu.py
+++ b/bitsandbytes/backends/npu.py
@@ -1,17 +1,32 @@
+import ctypes as ct
 from typing import Literal, Optional, Tuple, Union
 
 import torch
 
-from bitsandbytes.utils import QuantState
-
-from .base import Backend
-
 try:
     # to support Ascend NPU backend
     import torch_npu  # noqa: F401
 except ImportError:
     pass
 
+from bitsandbytes.cextension import lib
+from bitsandbytes.functional import (
+    get_4bit_type,
+    get_ptr,
+)
+from bitsandbytes.utils import QuantState
+
+from .base import Backend
+
+
+def assert_on_npu(tensors):
+    if not all(t.device.type == "npu" for t in tensors if t is not None):
+        raise TypeError(
+            "All input tensors to be on NPU, but found some tensors not be on NPU:\n"
+            f"{[(t.shape, t.device) if isinstance(t, torch.Tensor) else None for t in tensors]}"
+        )
+    return True
+
 
 class NPUBackend(Backend):
     def double_quant(
@@ -75,12 +90,62 @@ def quantize_4bit(
         A: torch.Tensor,
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
-        blocksize=64,
+        blocksize: Optional[int] = None,
         compress_statistics=False,
-        quant_type: Literal["fp4", "nf4"] = "fp4",
+        quant_type: Literal["fp4", "nf4"] = "nf4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
-        raise NotImplementedError
+        if quant_type not in ["nf4"]:
+            raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
+        if compress_statistics:
+            raise NotImplementedError("compress_statistics is not implemented.")
+        if blocksize is None:
+            blocksize = 128
+
+        prev_device = torch.npu.current_device()
+        torch.npu.set_device(A.device)
+        if A.dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            data = [
+                -1.0,
+                -0.6961928009986877,
+                -0.5250730514526367,
+                -0.39491748809814453,
+                -0.28444138169288635,
+                -0.18477343022823334,
+                -0.09105003625154495,
+                0.0,
+                0.07958029955625534,
+                0.16093020141124725,
+                0.24611230194568634,
+                0.33791524171829224,
+                0.44070982933044434,
+                0.5626170039176941,
+                0.7229568362236023,
+                1.0,
+            ]
+            data = torch.tensor(data, device="npu", dtype=torch.float32).view(1, -1)
+            absmax = A.view(-1, blocksize).abs().max(dim=1, keepdim=True).values
+            a = A.view(-1, blocksize) / absmax.float()
+            diff = torch.abs(a.unsqueeze(-1) - data)
+            out = (torch.argmin(diff, dim=-1) + 8) % 16
+            out = out.reshape(-1, 2)
+            out = (out[:, 0] + out[:, 1] * 16).to(torch.uint8)
+        else:
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+        assert_on_npu([A, absmax, out])
+        torch.npu.set_device(prev_device)
+
+        code = get_4bit_type(quant_type, device=A.device)
+        state = QuantState(
+            absmax=absmax,
+            shape=A.shape,
+            dtype=A.dtype,
+            blocksize=blocksize,
+            code=code,
+            quant_type=quant_type,
+        )
+
+        return out, state
 
     def dequantize_4bit(
         self,
@@ -88,10 +153,77 @@ def dequantize_4bit(
         quant_state: Optional[QuantState] = None,
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
-        blocksize: int = 64,
-        quant_type: Literal["fp4", "nf4"] = "fp4",
+        blocksize: Optional[int] = None,
+        quant_type: Literal["fp4", "nf4"] = "nf4",
     ) -> torch.Tensor:
-        raise NotImplementedError
+        if blocksize is None:
+            blocksize = 128
+        supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
+        if blocksize not in supported_blocksizes:
+            raise ValueError(
+                f"The blockwise of {blocksize} is not supported. Supported values: {supported_blocksizes}"
+            )
+
+        if quant_state is None:
+            assert absmax is not None and out is not None
+            quant_state = QuantState(
+                absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type
+            )
+        else:
+            absmax = quant_state.absmax
+
+        if out is None:
+            out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
+
+        n = out.numel()
+
+        prev_device = torch.npu.current_device()
+        torch.npu.set_device(A.device)
+        assert_on_npu([A, absmax, out])
+
+        if quant_state.quant_type not in ["nf4"]:
+            raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.")
+
+        if out.dtype == torch.float32:
+            lib.cdequantize_blockwise_fp32_nf4(
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_int(quant_state.blocksize),
+                ct.c_int(n),
+                torch.npu.current_stream(),
+            )
+        elif out.dtype == torch.float16:
+            lib.cdequantize_blockwise_fp16_nf4(
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_int(quant_state.blocksize),
+                ct.c_int(n),
+                torch.npu.current_stream(),
+            )
+        elif out.dtype == torch.bfloat16:
+            # bf16: bf16 -> fp32 -> op -> fp32 -> bf16
+            absmax = absmax.to(torch.float32)
+            out = out.to(torch.float32)
+            lib.cdequantize_blockwise_fp32_nf4(
+                get_ptr(A),
+                get_ptr(absmax),
+                get_ptr(out),
+                ct.c_int(quant_state.blocksize),
+                ct.c_int(n),
+                torch.npu.current_stream(),
+            )
+            out = out.to(torch.bfloat16)
+        else:
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+        torch.npu.set_device(prev_device)
+        is_transposed = True if A.shape[0] == 1 else False
+
+        if is_transposed:
+            return out.t()
+        else:
+            return out
 
     def gemv_4bit(
         self,
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index cc5d8deff..ec329cbb6 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -25,6 +25,7 @@
 
 from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR
 from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_rocm_gpu_arch
+from bitsandbytes.npu_specs import get_npu_specs
 
 logger = logging.getLogger(__name__)
 
@@ -100,6 +101,10 @@ def get_native_library() -> BNBNativeLibrary:
             binary_path = cuda_binary_path
         else:
             logger.warning("Could not find the bitsandbytes %s binary at %r", BNB_BACKEND, cuda_binary_path)
+    npu_specs = get_npu_specs()
+    if npu_specs:
+        binary_path = PACKAGE_DIR / f"libbitsandbytes_npu{DYNAMIC_LIBRARY_SUFFIX}"
+
     logger.debug(f"Loading bitsandbytes native library from: {binary_path}")
     dll = ct.cdll.LoadLibrary(str(binary_path))
 
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 66f14edf7..781e22541 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -314,6 +314,12 @@ def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: b
     def cpu(self, non_blocking: bool = False):
         return self.to(device="cpu", non_blocking=non_blocking)
 
+    def npu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
+        # `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
+        if isinstance(device, int):
+            device = f"npu:{device}"
+        return self.to(device="npu" if device is None else device, non_blocking=non_blocking)
+
     def xpu(self, non_blocking: bool = False):
         return self.to(device="xpu", non_blocking=non_blocking)
 
@@ -334,7 +340,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if device is not None and device.type in ["cuda", "cpu", "xpu"] and not self.bnb_quantized:
+        if device is not None and device.type in ["cuda", "cpu", "npu", "xpu"] and not self.bnb_quantized:
             return self._quantize(device)
         else:
             if self.quant_state is not None:
@@ -497,7 +503,7 @@ def forward(self, x: torch.Tensor):
                 self.weight.quant_state = self.quant_state
             else:
                 print(
-                    "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.",
+                    "FP4 quantization state not initialized. Please call .cuda(), .npu() or .to(device) on the LinearFP4 layer first.",
                 )
         if not self.compute_type_is_set:
             self.set_compute_type(x)
diff --git a/bitsandbytes/npu_specs.py b/bitsandbytes/npu_specs.py
new file mode 100644
index 000000000..7c7cd707e
--- /dev/null
+++ b/bitsandbytes/npu_specs.py
@@ -0,0 +1,20 @@
+import dataclasses
+
+import torch
+
+try:
+    import torch_npu  # noqa: F401
+except ImportError:
+    pass
+
+
+@dataclasses.dataclass(frozen=True)
+class NPUSpecs:
+    cann_version_string: str
+
+
+def get_npu_specs():
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        return NPUSpecs(cann_version_string=torch.version.cann)
+    else:
+        return None
diff --git a/csrc/npu_kernels.cpp b/csrc/npu_kernels.cpp
new file mode 100644
index 000000000..c70e71681
--- /dev/null
+++ b/csrc/npu_kernels.cpp
@@ -0,0 +1,222 @@
+#include "kernel_operator.h"
+#include "npu_ops.h"
+
+using namespace AscendC;
+
+constexpr int32_t BUFFER_NUM = 1;
+
+constexpr half Q_COFF_0 = -0.377685546875;
+constexpr half Q_COFF_1 = -3.193359375;
+constexpr half Q_COFF_2 = 0.583984375;
+constexpr half Q_COFF_3 = 6.02734375;
+constexpr half Q_COFF_4 = 1.9560546875;
+constexpr half Q_COFF_5 = 7.08984375;
+
+#define CEIL32(num) (((num) + 32 - 1) / 32 * 32)
+#define CEIL_BASE(num, base) (((num) + (base) - 1) / (base) * (base))
+
+
+template <typename T, uint32_t TypeMode>
+class KernelDequantizeBlockwiseNf4 {
+public:
+    __aicore__ inline KernelDequantizeBlockwiseNf4() {}
+
+    __aicore__ inline void Init(GM_ADDR A, GM_ADDR absmax, GM_ADDR out, GM_ADDR tilingDevice, TPipe &pipe)
+    {
+        ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
+        auto *tiling_data = reinterpret_cast<__gm__ BlockwiseNf4TilingData *>(tilingDevice);
+        this->blocksize = tiling_data->blocksize;
+        uint32_t coreNum = tiling_data->coreNum;
+        uint32_t singleCoreNumel = tiling_data->singleCoreNumel;
+        uint32_t singleCoreNumelTail = tiling_data->singleCoreNumelTail;
+        uint32_t numel = tiling_data->numel;
+        uint32_t ubSize = tiling_data->ubSize;
+        uint32_t blockIdx = (uint32_t)GetBlockIdx();
+        if (coreNum - blockIdx == 1) {
+            this->CurCoreFP16Num = singleCoreNumelTail;
+        } else {
+            this->CurCoreFP16Num = singleCoreNumel;
+        }
+        constexpr uint32_t ELEMENT_BYTES = (TypeMode == 1) ? 4 : 2;  // FP32: 4bytes, FP16/BF16: 2bytes
+        uint32_t eachBatchPkgNum = (ubSize - 16 * ELEMENT_BYTES) /
+            (this->blocksize / 2 * BUFFER_NUM + ELEMENT_BYTES * BUFFER_NUM + this->blocksize *
+            (ELEMENT_BYTES * BUFFER_NUM + sizeof(half) + sizeof(uint32_t) + ELEMENT_BYTES));
+        if (eachBatchPkgNum >= 32 / ELEMENT_BYTES) {
+            eachBatchPkgNum = (eachBatchPkgNum / (32 / ELEMENT_BYTES)) * (32 / ELEMENT_BYTES);
+        } else {
+            eachBatchPkgNum = (eachBatchPkgNum / 2) * 2;
+        }
+        this->eachBatchFP16Num = this->blocksize * eachBatchPkgNum; // 64 * 288
+
+        // gm, 32-byte alignment
+        uint32_t AOffset = singleCoreNumel / 2 * blockIdx;
+        uint32_t ABufferSize = singleCoreNumel / 2;
+        AGm.SetGlobalBuffer((__gm__ int8_t*)A + AOffset, ABufferSize);
+        uint32_t absmaxOffset = singleCoreNumel / this->blocksize * blockIdx;
+        uint32_t absmaxBufferSize = singleCoreNumel / this->blocksize;
+        absmaxGm.SetGlobalBuffer((__gm__ T*)absmax + absmaxOffset, absmaxBufferSize);
+        uint32_t outOffset = singleCoreNumel * blockIdx;
+        uint32_t outBufferSize = singleCoreNumel;
+        outGm.SetGlobalBuffer((__gm__ T*)out + outOffset, outBufferSize);
+
+        // TQue, 32-byte alignment
+        pipe.InitBuffer(inQueueA, BUFFER_NUM, this->eachBatchFP16Num / 2);
+        pipe.InitBuffer(inQueueAbsmax, BUFFER_NUM, CEIL32(eachBatchPkgNum * ELEMENT_BYTES));
+        pipe.InitBuffer(outQueueOut, BUFFER_NUM, this->eachBatchFP16Num * ELEMENT_BYTES);
+
+        // TBuf, 32-byte alignment
+        pipe.InitBuffer(calcNf4ToFloat, 16 * ELEMENT_BYTES);
+        pipe.InitBuffer(calcAFP16, this->eachBatchFP16Num * sizeof(half));
+        pipe.InitBuffer(calcAUint32, this->eachBatchFP16Num * sizeof(uint32_t));
+        pipe.InitBuffer(calcAbsmaxBuf, this->eachBatchFP16Num * ELEMENT_BYTES);
+    }
+
+    __aicore__ inline void Process(void)
+    {
+        Compute();
+    }
+
+private:
+    __aicore__ inline void initNf4ToFloat(LocalTensor<T> &nf4ToFloat)
+    {
+        if constexpr (TypeMode == 1) {
+            nf4ToFloat(0) = static_cast<float32_t>(-1.0);
+            nf4ToFloat(1) = static_cast<float32_t>(-0.6961928009986877);
+            nf4ToFloat(2) = static_cast<float32_t>(-0.5250730514526367);
+            nf4ToFloat(3) = static_cast<float32_t>(-0.39491748809814453);
+            nf4ToFloat(4) = static_cast<float32_t>(-0.28444138169288635);
+            nf4ToFloat(5) = static_cast<float32_t>(-0.18477343022823334);
+            nf4ToFloat(6) = static_cast<float32_t>(-0.09105003625154495);
+            nf4ToFloat(7) = static_cast<float32_t>(0.0);
+            nf4ToFloat(8) = static_cast<float32_t>(0.07958029955625534);
+            nf4ToFloat(9) = static_cast<float32_t>(0.16093020141124725);
+            nf4ToFloat(10) = static_cast<float32_t>(0.24611230194568634);
+            nf4ToFloat(11) = static_cast<float32_t>(0.33791524171829224);
+            nf4ToFloat(12) = static_cast<float32_t>(0.44070982933044434);
+            nf4ToFloat(13) = static_cast<float32_t>(0.5626170039176941);
+            nf4ToFloat(14) = static_cast<float32_t>(0.7229568362236023);
+            nf4ToFloat(15) = static_cast<float32_t>(1.0);
+        } else if constexpr (TypeMode == 2) {
+            nf4ToFloat(0) = static_cast<half>(-1.0);
+            nf4ToFloat(1) = static_cast<half>(-0.6962890625);
+            nf4ToFloat(2) = static_cast<half>(-0.52490234375);
+            nf4ToFloat(3) = static_cast<half>(-0.39501953125);
+            nf4ToFloat(4) = static_cast<half>(-0.284423828125);
+            nf4ToFloat(5) = static_cast<half>(-0.184814453125);
+            nf4ToFloat(6) = static_cast<half>(-0.091064453125);
+            nf4ToFloat(7) = static_cast<half>(0.0);
+            nf4ToFloat(8) = static_cast<half>(0.07958984375);
+            nf4ToFloat(9) = static_cast<half>(0.160888671875);
+            nf4ToFloat(10) = static_cast<half>(0.24609375);
+            nf4ToFloat(11) = static_cast<half>(0.337890625);
+            nf4ToFloat(12) = static_cast<half>(0.440673828125);
+            nf4ToFloat(13) = static_cast<half>(0.5625);
+            nf4ToFloat(14) = static_cast<half>(0.72314453125);
+            nf4ToFloat(15) = static_cast<half>(1.0);
+        }
+    }
+
+    __aicore__ inline void Compute(void)
+    {
+        constexpr uint32_t ELEMENT_BYTES = (TypeMode == 1) ? 4 : 2;  // FP32: 4bytes, FP16/BF16: 2bytes
+        LocalTensor<int8_t> ALocal = inQueueA.AllocTensor<int8_t>();
+        LocalTensor<T> absmaxLocal = inQueueAbsmax.AllocTensor<T>();
+        LocalTensor<T> outLocal = outQueueOut.AllocTensor<T>();
+
+        LocalTensor<half> AFP16 = calcAFP16.Get<half>();
+        LocalTensor<int32_t> AInt32 = calcAUint32.Get<int32_t>();
+        LocalTensor<T> absmaxBuf = calcAbsmaxBuf.Get<T>();
+        LocalTensor<T> nf4ToFloat = calcNf4ToFloat.Get<T>();
+        initNf4ToFloat(nf4ToFloat);
+
+        DataCopyParams dataCopyParams = {1, 0, 0, 0};
+        uint32_t curBatchNumel = this->eachBatchFP16Num;
+        uint32_t curBatchPkgNum = curBatchNumel / this->blocksize;
+
+        uint32_t batchCount = (this->CurCoreFP16Num + this->eachBatchFP16Num - 1) / this->eachBatchFP16Num;
+        for (uint32_t batchIdx = 0; batchIdx < batchCount; batchIdx++) {
+            if (batchCount - batchIdx == 1) {
+                curBatchNumel = this->CurCoreFP16Num - this->eachBatchFP16Num * batchIdx;
+                curBatchPkgNum = (curBatchNumel + this->blocksize - 1) / this->blocksize;
+            }
+
+            dataCopyParams.blockLen = curBatchNumel / 2;  // Byte
+            DataCopyPad(ALocal, AGm[this->eachBatchFP16Num / 2 * batchIdx], dataCopyParams, {true, 0, 0, 0});
+            dataCopyParams.blockLen = ELEMENT_BYTES * curBatchPkgNum;  // Byte
+            uint32_t gmOffset = this->eachBatchFP16Num / this->blocksize * batchIdx;
+            DataCopyPad(absmaxLocal, absmaxGm[gmOffset], dataCopyParams, {true, 0, 0, 0});
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            pipe_barrier(PIPE_ALL);
+
+            LocalTensor<int4b_t> AInt4 = ALocal.ReinterpretCast<int4b_t>();
+            Cast(AFP16, AInt4, RoundMode::CAST_NONE, curBatchNumel);
+            pipe_barrier(PIPE_V);
+            Adds(AFP16, AFP16, static_cast<half>(8), curBatchNumel);
+            pipe_barrier(PIPE_V);
+            if constexpr (TypeMode == 1) {
+                Muls(AFP16, AFP16, static_cast<half>(4), curBatchNumel);
+            } else {
+                Muls(AFP16, AFP16, static_cast<half>(2), curBatchNumel);
+            }
+            pipe_barrier(PIPE_V);
+            Cast(AInt32, AFP16, RoundMode::CAST_ROUND, curBatchNumel);
+            pipe_barrier(PIPE_V);
+            LocalTensor<uint32_t> AUint32 = AInt32.ReinterpretCast<uint32_t>();
+            Gather<T>(outLocal, nf4ToFloat, AUint32, 0, curBatchNumel);
+            pipe_barrier(PIPE_V);
+            uint32_t dstShape[] = {curBatchPkgNum, this->blocksize};
+            uint32_t srcShape[] = {curBatchPkgNum, 1};
+            BroadCast<T, 2, 1>(absmaxBuf, absmaxLocal, dstShape, srcShape);
+            pipe_barrier(PIPE_ALL);
+            Mul(outLocal, outLocal, absmaxBuf, curBatchNumel);
+            pipe_barrier(PIPE_ALL);
+
+            dataCopyParams.blockLen = ELEMENT_BYTES * curBatchNumel;  // Byte
+            DataCopyPad(outGm[batchIdx * this->eachBatchFP16Num], outLocal, dataCopyParams);
+            pipe_barrier(PIPE_MTE3);
+        }
+        pipe_barrier(PIPE_ALL);
+
+        inQueueA.FreeTensor(ALocal);
+        inQueueAbsmax.FreeTensor(absmaxLocal);
+        outQueueOut.FreeTensor(outLocal);
+    }
+
+private:
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueA;
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueAbsmax;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueOut;
+    TBuf<TPosition::VECCALC> calcAFP16;
+    TBuf<TPosition::VECCALC> calcAUint32;
+    TBuf<TPosition::VECCALC> calcNf4ToFloat;
+    TBuf<TPosition::VECCALC> calcAbsmaxBuf;
+    GlobalTensor<int8_t> AGm;
+    GlobalTensor<T> absmaxGm;
+    GlobalTensor<T> outGm;
+    uint32_t blocksize;
+    uint32_t CurCoreFP16Num;
+    uint32_t eachBatchFP16Num;
+};
+
+
+
+extern "C" {
+
+__global__ __aicore__ void dequantize_blockwise_fp32_nf4(GM_ADDR A, GM_ADDR absmax, GM_ADDR out, GM_ADDR tiling)
+{
+    TPipe pipe;
+    KernelDequantizeBlockwiseNf4<float32_t, 1> op;
+    op.Init(A, absmax, out, tiling, pipe);
+    op.Process();
+}
+
+__global__ __aicore__ void dequantize_blockwise_fp16_nf4(GM_ADDR A, GM_ADDR absmax, GM_ADDR out, GM_ADDR tiling)
+{
+    TPipe pipe;
+    KernelDequantizeBlockwiseNf4<half, 2> op;
+    op.Init(A, absmax, out, tiling, pipe);
+    op.Process();
+}
+
+}
diff --git a/csrc/npu_ops.cpp b/csrc/npu_ops.cpp
new file mode 100644
index 000000000..fb5ecef2f
--- /dev/null
+++ b/csrc/npu_ops.cpp
@@ -0,0 +1,51 @@
+#include <iostream>
+#include "acl/acl.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "npu_ops.h"
+
+#include "aclrtlaunch_dequantize_blockwise_fp32_nf4.h"
+#include "aclrtlaunch_dequantize_blockwise_fp16_nf4.h"
+
+
+extern "C" {
+
+int32_t get_dequantize_blockwise_nf4_tiling(uint32_t blocksize, uint32_t n, BlockwiseNf4TilingData *tiling) {
+    tiling->ubSize = 196 * 1024;
+    uint32_t coreNum = 40;
+    uint32_t totalPkgNum = (n + blocksize - 1) / blocksize;
+    uint32_t singleCorePkgNum = (totalPkgNum + coreNum - 1) / coreNum;
+    coreNum = (totalPkgNum + singleCorePkgNum - 1) / singleCorePkgNum;
+    uint32_t singleCoreNumel = singleCorePkgNum * blocksize;
+    uint32_t singleCoreNumelTail = n % singleCoreNumel;
+    if (singleCoreNumelTail == 0) {
+        singleCoreNumelTail = singleCoreNumel;
+    }
+    tiling->coreNum = coreNum;
+    tiling->blocksize = blocksize;
+    tiling->numel = n;
+    tiling->singleCoreNumel = singleCoreNumel;
+    tiling->singleCoreNumelTail = singleCoreNumelTail;
+    return 0;
+}
+
+void dequantizeBlockwiseNf4(uint8_t *A, uint8_t *absmax, uint8_t *out, uint32_t blocksize, uint32_t n, void* stream, const uint32_t type_mode) {
+    uint32_t blockDim = 40;
+    size_t tilingSize = sizeof(struct BlockwiseNf4TilingData);
+    BlockwiseNf4TilingData *tilingHost;
+    tilingHost = (struct BlockwiseNf4TilingData *)malloc(tilingSize);
+    uint32_t error = get_dequantize_blockwise_nf4_tiling(blocksize, n, tilingHost);
+    if (error != 0) {
+        printf("[!] error\n");
+    }
+    uint8_t *tilingDevice = nullptr;
+    aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_NORMAL_ONLY);
+    aclrtMemcpyAsync((void *)tilingDevice, tilingSize, tilingHost, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE, stream);
+    if (type_mode == 1) {
+        ACLRT_LAUNCH_KERNEL(dequantize_blockwise_fp32_nf4)(blockDim, stream, A, absmax, out, tilingDevice);
+    } else if (type_mode == 2) {
+        ACLRT_LAUNCH_KERNEL(dequantize_blockwise_fp16_nf4)(blockDim, stream, A, absmax, out, tilingDevice);
+    }
+    aclrtFree(tilingDevice);
+}
+
+}
diff --git a/csrc/npu_ops.h b/csrc/npu_ops.h
new file mode 100644
index 000000000..d7a26cd34
--- /dev/null
+++ b/csrc/npu_ops.h
@@ -0,0 +1,28 @@
+#ifndef NPU_OPS_H
+#define NPU_OPS_H
+#include <cstdint>
+
+#define CHECK_ACL(x)                                                                        \
+    do {                                                                                    \
+        aclError __ret = x;                                                                 \
+        if (__ret != ACL_ERROR_NONE) {                                                      \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+        }                                                                                   \
+    } while (0);
+
+
+struct BlockwiseNf4TilingData {
+    uint32_t coreNum;
+    uint32_t blocksize;
+    uint32_t numel;
+    uint32_t singleCoreNumel;
+    uint32_t singleCoreNumelTail;
+    uint32_t ubSize;
+};
+
+extern "C" {
+
+void dequantizeBlockwiseNf4(uint8_t *A, uint8_t *absmax, uint8_t *out, uint32_t blocksize, uint32_t n, void* stream, const uint32_t type_mode);
+
+}
+#endif
diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
index be6abc070..2d3031936 100644
--- a/csrc/pythonInterface.cpp
+++ b/csrc/pythonInterface.cpp
@@ -12,6 +12,9 @@
 #if BUILD_MPS
 // #include <mps_ops.h>
 #endif
+#if BUILD_NPU
+#include <npu_ops.h>
+#endif
 #include <cpu_ops.h>
 
 // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
@@ -601,6 +604,14 @@ extern "C"
 
 #endif
 
+#if BUILD_NPU
+	void cdequantize_blockwise_fp32_nf4(uint8_t *A, uint8_t *absmax, uint8_t *out, uint32_t blocksize, uint32_t n, void* stream)
+	{ dequantizeBlockwiseNf4(A, absmax, out, blocksize, n, stream, 1); }
+
+	void cdequantize_blockwise_fp16_nf4(uint8_t *A, uint8_t *absmax, uint8_t *out, uint32_t blocksize, uint32_t n, void* stream)
+	{ dequantizeBlockwiseNf4(A, absmax, out, blocksize, n, stream, 2); }
+#endif
+
 	void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
 	void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
 }
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 615dfd95e..79613856f 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -210,6 +210,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
 | **Apple Silicon (MPS)** | WIP                        | 3.10+                     | M1/M2 chips                    | Planned    |
 | **Intel CPU** | v2.5.0+ (`ipex`)         | 3.10+                     | Intel CPU | Alpha |
 | **Intel GPU** | v2.5.0+ (`ipex`)         | 3.10+                     | Intel GPU | Experimental |
+| **Ascend NPU** | 2.1.0+ (`torch_npu`)         | 3.10+                     | Ascend NPU | Experimental |
 
 For each supported backend, follow the respective instructions below:
 
@@ -251,6 +252,13 @@ Compatible hardware and functioning `import intel_extension_for_pytorch as ipex`
 
 Please refer to [the official Intel installations instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
 
+</hfoption>
+<hfoption id="Ascend NPU">
+
+Compatible hardware and functioning `import torch_npu` capable environment with Python `3.10` as the minimum requirement.
+
+Please refer to [the official Ascend installations instructions](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) for guidance on how to pip install the necessary `torch_npu` dependency.
+
 </hfoption>
 <hfoption id="Apple Silicon (MPS)">
 
@@ -339,6 +347,31 @@ pip install -r requirements-dev.txt
 pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise leave that out)
 ```
 
+</hfoption>
+<hfoption id="Ascend NPU">
+
+#### Ascend NPU
+
+> [!TIP]
+> Ascend NPU backend only supports building from source; for now, please follow the instructions below.
+
+
+```
+# Install bitsandbytes from source
+# Clone bitsandbytes repo, Ascend NPU backend is currently enabled on multi-backend-refactor branch
+git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
+
+# Install dependencies
+pip install -r requirements-dev.txt
+
+# Compile & install
+apt-get install -y build-essential cmake  # install build tools dependencies, unless present
+cmake -DCOMPUTE_BACKEND=npu -S .
+make
+pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise leave that out)
+```
+
+
 </hfoption>
 <hfoption id="Apple Silicon (MPS)">
 

From 7e6f8657abd4b9547031585c0dc2af50a3160e80 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 18 Dec 2024 00:27:14 +0800
Subject: [PATCH 205/234] fix device check (#1453)

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/nn/modules.py | 2 +-
 bitsandbytes/utils.py      | 8 ++++----
 setup.py                   | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 781e22541..ad5a7d443 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -481,7 +481,7 @@ def set_ipex_linear(self, x: torch.Tensor):
             and not self.training
             and x.requires_grad == False
         ):
-            enable_ipex_fusion(self)
+            enable_ipex_fusion(self, x)
 
     def forward(self, x: torch.Tensor):
         # Check if ipex fusion can be used
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index adb36279c..02c9ac2ca 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,15 +200,15 @@ def unpack_tensor_to_dict(tensor_data):
     return unpacked_dict
 
 
-def enable_ipex_fusion(linear):
+def enable_ipex_fusion(linear, x):
     from bitsandbytes.backends.cpu_xpu_common import (
         _ipex_cpu_version_prereq,
         _ipex_xpu_version_prereq,
-        ipex_cpu_only,
+        ipex_cpu,
         ipex_xpu,
     )
 
-    if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5):
+    if x.device.type == "cpu" and ipex_cpu and _ipex_cpu_version_prereq(2, 5):
         quant_state = linear.weight.quant_state
         new_weight, new_scales, new_zeros, _, compensation = torch.ops.ipex_prepack.woq_linear_pack_weight(
             linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
@@ -221,7 +221,7 @@ def enable_ipex_fusion(linear):
             quant_state.blocksize,
             2,
         )
-    elif ipex_xpu and _ipex_xpu_version_prereq(2, 5):
+    elif x.device.type == "xpu" and ipex_xpu and _ipex_xpu_version_prereq(2, 5):
         quant_state = linear.weight.quant_state
         new_weight = linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
 
diff --git a/setup.py b/setup.py
index e8d3f547c..4002ee268 100644
--- a/setup.py
+++ b/setup.py
@@ -27,8 +27,8 @@ def get_latest_semver_tag():
     tags = subprocess.check_output(["git", "tag"], text=True).splitlines()
     semver_tags = [tag for tag in tags if tag.count(".") == 2 and all(part.isdigit() for part in tag.split("."))]
     if not semver_tags:
-        print("No valid semantic version tags found, use 0.0.1 defaultly")
-        semver_tags = ["0.0.1"]
+        print("No valid semantic version tags found, use 1.0.0 defaultly")
+        semver_tags = ["1.0.0"]
     return sorted(semver_tags, key=lambda s: list(map(int, s.split("."))))[-1]
 
 

From f6025bcae1395b1f3b5a993c4ac3ddde1dfda699 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 23 Jan 2025 05:17:34 +0800
Subject: [PATCH 206/234] Enable double quant on Intel CPU and XPU (#1472)

* fix dequant 8bit

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* support double quant on intel cpu and xpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix format

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix shape

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix 4bit format

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix device error for xpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix 4bit tensor shape

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix nf4 xpu finetune

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu_xpu_common.py | 71 ++++++++++++++++++-------
 bitsandbytes/backends/xpu.py            |  2 +-
 bitsandbytes/nn/modules.py              |  5 +-
 bitsandbytes/utils.py                   | 31 +++++++++--
 4 files changed, 83 insertions(+), 26 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 8fdf7569d..75f647939 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -3,11 +3,14 @@
 import warnings
 
 import torch
+import torch.nn.functional as F
 
 from bitsandbytes.functional import (
     QuantState,
+    create_dynamic_map,
     get_4bit_type,
 )
+from bitsandbytes.utils import reverse_4bit_compress_format
 
 try:
     # to support Intel CPU/GPU (XPU) backend
@@ -279,8 +282,9 @@ def mm_dequant_impl(
     0.8333333: 3,  # 0b0011
 }
 
+INT8_QUANT_TABLE = create_dynamic_map().tolist()
+
 
-@_maybe_torch_compile
 def quantize_4bit_impl(
     A: Tensor,
     absmax: Tensor = None,
@@ -314,7 +318,7 @@ def quantize_4bit_impl(
     tuple(torch.Tensor, torch.Size, torch.dtype, int):
         The quantization state to undo the quantization.
     """
-    if quant_type not in ["nf4", "fp4"]:
+    if quant_type not in ["nf4", "fp4", "int8"]:
         raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU.")
     if quant_type == "fp4":
         warnings.warn("fp4 quantization is currently slow on CPU/XPU. Please Use nf4 instead for better performance.")
@@ -355,14 +359,34 @@ def quantize_4bit_impl(
         for key, val in FP4_QUANT_TABLE.items():
             out_uint8[abs_scaled_A > key] = val
         out_uint8 += sign.to(torch.uint8) * 8
-    if out_uint8.size(-1) % 2:
-        out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0)
-    out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2])
+    elif quant_type == "int8":
+        for i in range(len(INT8_QUANT_TABLE)):
+            out_uint8[scaled_A > INT8_QUANT_TABLE[i]] = i
 
-    code = get_4bit_type(quant_type, device=A.device)
+    if quant_type == "int8":
+        out = out_uint8
+        code = torch.Tensor(INT8_QUANT_TABLE).to(A.device)
+    else:
+        if out_uint8.size(-1) % 2:
+            out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0)
+        out[:] = out_uint8[::2].bitwise_left_shift(4).bitwise_or_(out_uint8[1::2])
+        code = get_4bit_type(quant_type, device=A.device)
 
     if compress_statistics:
-        raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
+        offset = absmax.mean()
+        absmax -= offset
+        qabsmax, state2 = quantize_4bit_impl(absmax, blocksize=256, quant_type="int8")
+        del absmax
+        state = QuantState(
+            absmax=qabsmax,
+            shape=input_shape,
+            dtype=A.dtype,
+            blocksize=blocksize,
+            code=code,
+            quant_type=quant_type,
+            offset=offset,
+            state2=state2,
+        )
     else:
         state = QuantState(
             absmax=absmax,
@@ -373,7 +397,21 @@ def quantize_4bit_impl(
             quant_type=quant_type,
         )
 
-    return out.unsqueeze(0), state
+    return out.reshape(-1, 1), state
+
+
+def dequant_8bit(A, offset, quant_state):
+    assert A.dtype == torch.uint8
+    absmax = quant_state.code[A.reshape(-1).int()]
+    blocks = absmax.shape[-1] // 256
+    res = absmax.shape[-1] % 256
+    if res != 0:
+        absmax = F.pad(absmax, (0, 256 - res), mode="constant", value=0)
+    absmax = (absmax.view(-1, 256) * quant_state.absmax.view(-1, 1)).to(quant_state.dtype).reshape(-1)
+    absmax = absmax[: blocks * 256 + res]
+    absmax = absmax.reshape(A.shape)
+    absmax += offset
+    return absmax
 
 
 @_maybe_torch_compile
@@ -411,12 +449,8 @@ def dequantize_4bit_impl(
     torch.Tensor:
         Dequantized tensor.
     """
-    if A.shape[0] == 1:
-        transpose = False
-        A = A.squeeze(0)
-    elif A.shape[1] == 1:
-        transpose = True
-        A = A.squeeze(1)
+    transpose = True if A.shape[0] == 1 else False
+    A = A.reshape(-1)
 
     if quant_state is None:
         assert absmax is not None and out is not None
@@ -438,17 +472,18 @@ def dequantize_4bit_impl(
         )
 
     if quant_state.nested:
-        raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU")
+        absmax = dequant_8bit(absmax, quant_state.offset, quant_state.state2)
 
     if ipex_cpu_only and _ipex_cpu_version_prereq(2, 5) and getattr(quant_state, "ipex", False):
-        A = torch.ops.ipex_prepack.woq_linear_unpack_weight(A, "nf4", quant_state.shape, 2)
+        ipex_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(A, "nf4", quant_state.shape, 2)
+        A = reverse_4bit_compress_format(ipex_weight)
         quant_state.ipex = False
 
     # Map nf4 to [-1, 1]
     out_dq = torch.empty(A.size(0) * 2, dtype=torch.int32, device=A.device)
     n = out_dq.numel()
-    out_dq[::2] = A & 0xF
-    out_dq[1::2] = A >> 4
+    out_dq[1::2] = A & 0xF
+    out_dq[::2] = A >> 4
     # quant_state.code is fp32, cast to quant_state dtype to avoid the mismatch issue
     quant_state.code = quant_state.code.to(quant_state.dtype)
     out_dq = quant_state.code[out_dq]
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index bc13963e6..aca0a0103 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -155,7 +155,7 @@ def dequantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_xpu([A, absmax, out])
-        if quant_type == "nf4":
+        if quant_type == "nf4" and getattr(quant_state, "ipex", False):
             output = torch.ops.torch_ipex.dequantize_4bit(A, "nf4", quant_state.shape, absmax, None, blocksize).t()
         else:
             output = dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index ad5a7d443..2320ffd39 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -20,6 +20,7 @@
     LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
     OutlierTracer,
     enable_ipex_fusion,
+    reverse_4bit_compress_format,
 )
 
 T = TypeVar("T", bound="torch.nn.Module")
@@ -460,9 +461,9 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
                 original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
                     self.weight, "nf4", self.weight.quant_state.shape, 2
                 )
-                self.weight.data = original_weight.data
+                self.weight.data = reverse_4bit_compress_format(original_weight.data)
             elif self.weight.device.type == "xpu":
-                self.weight.data = self.weight.data.reshape(1, -1)
+                self.weight.data = reverse_4bit_compress_format(self.weight.data.reshape(1, -1))
 
             self.weight.quant_state.ipex = False
 
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 02c9ac2ca..e3748685e 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -200,18 +200,35 @@ def unpack_tensor_to_dict(tensor_data):
     return unpacked_dict
 
 
+def reverse_4bit_compress_format(weight):
+    out_1 = torch.empty(weight.size(0), dtype=torch.int32, device=weight.device)
+    out_2 = torch.empty(weight.size(0), dtype=torch.int32, device=weight.device)
+    out_1 = (weight & 0xF0) >> 4
+    out_2 = (weight & 0xF) << 4
+    out = out_1 | out_2
+    return out
+
+
 def enable_ipex_fusion(linear, x):
     from bitsandbytes.backends.cpu_xpu_common import (
         _ipex_cpu_version_prereq,
         _ipex_xpu_version_prereq,
+        dequant_8bit,
         ipex_cpu,
         ipex_xpu,
     )
 
+    quant_state = linear.weight.quant_state
+
+    if quant_state.nested:
+        quant_state.absmax = dequant_8bit(quant_state.absmax, quant_state.offset, quant_state.state2)
+        quant_state.nested = False
+        delattr(quant_state, "state2")
+
     if x.device.type == "cpu" and ipex_cpu and _ipex_cpu_version_prereq(2, 5):
-        quant_state = linear.weight.quant_state
+        converted_weight = reverse_4bit_compress_format(linear.weight.data)
         new_weight, new_scales, new_zeros, _, compensation = torch.ops.ipex_prepack.woq_linear_pack_weight(
-            linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+            converted_weight.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
             "nf4",
             quant_state.shape,  # weight shape
             quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize),  # scales
@@ -222,12 +239,16 @@ def enable_ipex_fusion(linear, x):
             2,
         )
     elif x.device.type == "xpu" and ipex_xpu and _ipex_xpu_version_prereq(2, 5):
-        quant_state = linear.weight.quant_state
-        new_weight = linear.weight.data.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
-
+        converted_weight = reverse_4bit_compress_format(linear.weight.data)
+        new_weight = converted_weight.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
         new_scales = quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
         new_zeros = None
         compensation = None
+    else:
+        raise ValueError(
+            "Please check the device and ipex version. The device should be cpu or xpu while ipex version should >= 2.5"
+        )
+
     linear.weight.data = new_weight.data
     linear.weight.quant_state.ipex = True
     linear.weight.quant_state.new_scales = new_scales

From 307fbd52bdc3734130b505781b72a1e15cf83e0c Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 29 Jan 2025 00:31:11 +0800
Subject: [PATCH 207/234] Enable dequant+matmul 8bit path for Intel CPU and XPU
 (#1484)

* new matmul8bit

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix cxb

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/autograd/_functions.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index 6440ab1b5..9de5a8924 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -563,6 +563,29 @@ def backward(ctx, grad_output):
         return grad_A, grad_B, None, grad_bias, None
 
 
+class MatMul8bitFp(torch.autograd.Function):
+    # For Intel CPU and XPU, the double quant has many unsafe operations which will breaks the finetune.
+    # We'd like to use dequant + matmul to run finetune currently.
+
+    @staticmethod
+    def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
+        CB = B.data.to(A.dtype).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0)).t()
+        output = torch.matmul(A, CB).to(A.dtype)
+        ctx.state = state
+        ctx.dtype_A = A.dtype
+        ctx.grad_shape = A.shape
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        state = ctx.state
+        B = state.CxB if state.CxB is not None else state.CB
+        CB = B.to(ctx.dtype_A).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
+        grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)
+
+        return grad_A, None, None, None, None
+
+
 def matmul(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -574,6 +597,8 @@ def matmul(
     state = state or MatmulLtState()
     if threshold > 0.0:
         state.threshold = threshold
+    if A.device.type in ("cpu", "xpu") and state.is_training:
+        return MatMul8bitFp.apply(A, B, out, bias, state)
     return MatMul8bitLt.apply(A, B, out, bias, state)
 
 

From a0a95fd70e1c988043b7a674b882841ba21bf4be Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli0116@gmail.com>
Date: Wed, 29 Jan 2025 00:34:39 +0800
Subject: [PATCH 208/234] add device index (#1489)

---
 bitsandbytes/nn/modules.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 2320ffd39..81404179d 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -660,9 +660,9 @@ def cpu(self):
         self.SCB = SCB
         return self
 
-    def xpu(self):
+    def xpu(self, device):
         # we store the 8-bit rows-major weight
-        B = self.data.contiguous().to(torch.float16).xpu()
+        B = self.data.contiguous().to(torch.float16).xpu(device)
         CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
         if CBt is not None:
             del CBt
@@ -700,11 +700,11 @@ def to(self, *args, **kwargs):
                 return self.cpu()
         elif device.type == "xpu":
             if self.data.dtype == torch.int8:
-                self.data = self.data.contiguous().xpu()
+                self.data = self.data.contiguous().xpu(device)
                 self.CB = self.data
                 return self
             else:
-                return self.xpu()
+                return self.xpu(device)
         else:
             new_param = Int8Params(
                 super().to(device=device, dtype=dtype, non_blocking=non_blocking),

From ed2a58d2c416346d903e2dfe8bfe6912bb618b3a Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 7 Feb 2025 14:46:25 -0500
Subject: [PATCH 209/234] Update base backend docstrings

---
 bitsandbytes/backends/base.py | 332 ++++++++++++++++++++++++----------
 1 file changed, 239 insertions(+), 93 deletions(-)

diff --git a/bitsandbytes/backends/base.py b/bitsandbytes/backends/base.py
index 118b24be6..818897ed2 100644
--- a/bitsandbytes/backends/base.py
+++ b/bitsandbytes/backends/base.py
@@ -9,17 +9,132 @@
 class Backend(ABC):
     """Base class for devices backends that will implement their own 8bits and 4bits functions."""
 
-    # @abstractmethod
-    # def double_quant(
-    #     self,
-    #     A: torch.Tensor,
-    #     col_stats: Optional[torch.Tensor] = None,
-    #     row_stats: Optional[torch.Tensor] = None,
-    #     out_col: Optional[torch.Tensor] = None,
-    #     out_row: Optional[torch.Tensor] = None,
-    #     threshold=0.0,
-    # ):
-    #     raise NotImplementedError
+    @abstractmethod
+    def int8_double_quant(
+        self,
+        A: torch.Tensor,
+        col_stats: Optional[torch.Tensor] = None,
+        row_stats: Optional[torch.Tensor] = None,
+        out_col: Optional[torch.Tensor] = None,
+        out_row: Optional[torch.Tensor] = None,
+        threshold=0.0,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.
+
+        The statistics are determined both row-wise and column-wise (transposed).
+
+        For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).
+
+        <Tip>
+        This function is useful for training, but for inference it is advised to use [`int8_vectorwise_quant`] instead.
+        This implementation performs additional column-wise transposed calculations which are not optimized.
+        </Tip>
+
+        Args:
+            A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
+            col_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantization scales.
+            row_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantization scales.
+            out_col (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantized data.
+            out_row (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantized data.
+            threshold (`float`, *optional*):
+                An optional threshold for sparse decomposition of outlier features.
+
+                No outliers are held back when 0.0. Defaults to 0.0.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
+            - `torch.Tensor` with dtype `torch.int8`: The row-wise quantized data.
+            - `torch.Tensor` with dtype `torch.int8`: The column-wise quantized data.
+            - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization scales.
+            - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization scales.
+            - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
+        """
+        ...
+
+    @abstractmethod
+    def int8_linear_matmul(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        dtype=torch.int32,
+    ) -> torch.Tensor:
+        """Performs an 8-bit integer matrix multiplication.
+
+        A linear transformation is applied such that `out = A @ B.T`. When possible, integer tensor core hardware is
+        utilized to accelerate the operation.
+
+        Args:
+            A (`torch.Tensor`): The first matrix operand with the data type `torch.int8`.
+            B (`torch.Tensor`): The second matrix operand with the data type `torch.int8`.
+            out (`torch.Tensor`, *optional*): A pre-allocated tensor used to store the result.
+            dtype (`torch.dtype`, *optional*): The expected data type of the output. Defaults to `torch.int32`.
+
+        Raises:
+            `NotImplementedError`: The operation is not supported in the current environment.
+            `RuntimeError`: Raised when the cannot be completed for any other reason.
+
+        Returns:
+            `torch.Tensor`: The result of the operation.
+        """
+        ...
+
+    @abstractmethod
+    def int8_mm_dequant(
+        self,
+        A: torch.Tensor,
+        row_stats: torch.Tensor,
+        col_stats: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Performs dequantization on the result of a quantized int8 matrix multiplication.
+
+        Args:
+            A (`torch.Tensor` with dtype `torch.int32`): The result of a quantized int8 matrix multiplication.
+            row_stats (`torch.Tensor`): The row-wise quantization statistics for the lhs operand of the matrix multiplication.
+            col_stats (`torch.Tensor`): The column-wise quantization statistics for the rhs operand of the matrix multiplication.
+            out (`torch.Tensor`, *optional*): A pre-allocated tensor to store the output of the operation.
+            bias (`torch.Tensor`, *optional*): An optional bias vector to add to the result.
+
+        Returns:
+            `torch.Tensor`: The dequantized result with an optional bias, with dtype `torch.float16`.
+        """
+        ...
+
+    @abstractmethod
+    def int8_vectorwise_dequant(self, A: torch.Tensor, stats: torch.Tensor):
+        """Dequantizes a tensor with dtype `torch.int8` to `torch.float32`.
+
+        Args:
+            A (`torch.Tensor` with dtype `torch.int8`): The quantized int8 tensor.
+            stats (`torch.Tensor` with dtype `torch.float32`): The row-wise quantization statistics.
+
+        Returns:
+            `torch.Tensor` with dtype `torch.float32`: The dequantized tensor.
+        """
+        ...
+
+    @abstractmethod
+    def int8_vectorwise_quant(self, A: torch.Tensor, threshold=0.0):
+        """Quantizes a tensor with dtype `torch.float16` to `torch.int8` in accordance to the `LLM.int8()` algorithm.
+
+        For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).
+
+        Args:
+            A (`torch.Tensor` with dtype `torch.float16`): The input tensor.
+            threshold (`float`, *optional*):
+                An optional threshold for sparse decomposition of outlier features.
+
+                No outliers are held back when 0.0. Defaults to 0.0.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
+            - `torch.Tensor` with dtype `torch.int8`: The quantized data.
+            - `torch.Tensor` with dtype `torch.float32`: The quantization scales.
+            - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
+        """
+        ...
 
     @abstractmethod
     def transform(
@@ -34,33 +149,6 @@ def transform(
     ):
         raise NotImplementedError
 
-    # @abstractmethod
-    # def igemmlt(
-    #     self,
-    #     A: torch.Tensor,
-    #     B: torch.Tensor,
-    #     SA: Tuple[torch.Size, str],
-    #     SB: Tuple[torch.Size, str],
-    #     out: Optional[torch.Tensor] = None,
-    #     Sout: Optional[Tuple[torch.Size, str]] = None,
-    #     dtype=torch.int32,
-    # ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
-    #     raise NotImplementedError
-
-    # @abstractmethod
-    # def mm_dequant(
-    #     self,
-    #     A: torch.Tensor,
-    #     quant_state: Tuple[torch.Size, str],
-    #     row_stats: torch.Tensor,
-    #     col_stats: torch.Tensor,
-    #     out: Optional[torch.Tensor] = None,
-    #     new_row_stats: Optional[torch.Tensor] = None,
-    #     new_col_stats: Optional[torch.Tensor] = None,
-    #     bias: Optional[torch.Tensor] = None,
-    # ) -> torch.Tensor:
-    #     raise NotImplementedError
-
     @abstractmethod
     def extract_outliers(
         self,
@@ -81,32 +169,30 @@ def quantize_4bit(
         quant_type: Literal["fp4", "nf4"] = "fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
+        """Quantize tensor A in blocks of 4-bit values.
+
+        Quantizes tensor A by dividing it into blocks which are independently quantized.
+
+        Args:
+            A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
+            absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
+            out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+            blocksize (`int`, *optional*):
+                The size of the blocks. Defaults to 64.
+                Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+            compress_statistics (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
+            quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
+            quant_storage (`torch.dtype`, *optional*): The dtype of the tensor used to store the result. Defaults to `torch.uint8`.
+
+        Raises:
+            ValueError: Raised when the input data type is not supported.
+
+        Returns:
+            Tuple[`torch.Tensor`, `QuantState`]: A tuple containing the quantization results.
+            - `torch.Tensor`: The quantized tensor with packed 4-bit values.
+            - [`QuantState`]: The state object used to undo the quantization.
         """
-        Quantize tensor A in blocks of 4-bit values.
-
-        Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
-
-        Parameters
-        ----------
-        A : torch.Tensor
-            The input tensor.
-        absmax : torch.Tensor
-            The absmax values.
-        out : torch.Tensor
-            The output tensor.
-        blocksize : int
-            The blocksize used in quantization.
-        quant_type : str
-            The 4-bit quantization data type {fp4, nf4}
-
-        Returns
-        -------
-        torch.Tensor:
-            Tensor with packed 4-bit values.
-        tuple(torch.Tensor, torch.Size, torch.dtype, int):
-            The quantization state to undo the quantization.
-        """
-        raise NotImplementedError
+        ...
 
     @abstractmethod
     def dequantize_4bit(
@@ -118,33 +204,33 @@ def dequantize_4bit(
         blocksize: int = 64,
         quant_type: Literal["fp4", "nf4"] = "fp4",
     ) -> torch.Tensor:
+        """Dequantizes a packed 4-bit quantized tensor.
+
+        The input tensor is dequantized by dividing it into blocks of `blocksize` values.
+        The the absolute maximum value within these blocks is used for scaling
+        the non-linear dequantization.
+
+        Args:
+            A (`torch.Tensor`): The quantized input tensor.
+            quant_state ([`QuantState`], *optional*):
+                The quantization state as returned by [`quantize_4bit`].
+                Required if `absmax` is not provided.
+            absmax (`torch.Tensor`, *optional*):
+                A tensor containing the scaling values.
+                Required if `quant_state` is not provided and ignored otherwise.
+            out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+            blocksize (`int`, *optional*):
+                The size of the blocks. Defaults to 64.
+                Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+            quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
+
+        Raises:
+            ValueError: Raised when the input data type or blocksize is not supported.
+
+        Returns:
+            `torch.Tensor`: The dequantized tensor.
         """
-        Dequantizes FP4 blockwise quantized values.
-
-        Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
-
-        Parameters
-        ----------
-        A : torch.Tensor
-            The input tensor (packed 4-bit values).
-        quant_state : QuantState
-            object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
-        absmax : torch.Tensor
-            The absmax values.
-        out : torch.Tensor
-            Dequantized output tensor.
-        blocksize : int
-            The blocksize used in quantization.
-        quant_type : str
-            The 4-bit quantization data type {fp4, nf4}
-
-
-        Returns
-        -------
-        torch.Tensor:
-            Dequantized tensor.
-        """
-        raise NotImplementedError
+        ...
 
     @abstractmethod
     def gemv_4bit(
@@ -155,8 +241,7 @@ def gemv_4bit(
         transposed_A=False,
         transposed_B=False,
         state: QuantState = None,
-    ) -> torch.Tensor:
-        raise NotImplementedError
+    ) -> torch.Tensor: ...
 
     @abstractmethod
     def quantize_blockwise(
@@ -168,7 +253,33 @@ def quantize_blockwise(
         blocksize=4096,
         nested=False,
     ) -> Tuple[torch.Tensor, QuantState]:
-        raise NotImplementedError
+        """Quantize a tensor in blocks of values.
+
+        The input tensor is quantized by dividing it into blocks of `blocksize` values.
+        The the absolute maximum value within these blocks is calculated for scaling
+        the non-linear quantization.
+
+        Args:
+            A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
+            code (`torch.Tensor`, *optional*):
+                A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
+                For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
+            absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
+            out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+            blocksize (`int`, *optional*):
+                The size of the blocks. Defaults to 4096.
+                Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+            nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
+
+        Raises:
+            ValueError: Raised when the input data type is not supported.
+
+        Returns:
+            `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
+            - `torch.Tensor`: The quantized tensor.
+            - [`QuantState`]: The state object used to undo the quantization.
+        """
+        ...
 
     @abstractmethod
     def dequantize_blockwise(
@@ -181,7 +292,38 @@ def dequantize_blockwise(
         blocksize: int = 4096,
         nested=False,
     ) -> torch.Tensor:
-        raise NotImplementedError
+        """Dequantize a tensor in blocks of values.
+
+        The input tensor is dequantized by dividing it into blocks of `blocksize` values.
+        The the absolute maximum value within these blocks is used for scaling
+        the non-linear dequantization.
+
+        Args:
+            A (`torch.Tensor`): The quantized input tensor.
+            quant_state ([`QuantState`], *optional*):
+                The quantization state as returned by [`quantize_blockwise`].
+                Required if `absmax` is not provided.
+            absmax (`torch.Tensor`, *optional*):
+                A tensor containing the scaling values.
+                Required if `quant_state` is not provided and ignored otherwise.
+            code (`torch.Tensor`, *optional*):
+                A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
+                For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
+                Ignored when `quant_state` is provided.
+            out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+            blocksize (`int`, *optional*):
+                The size of the blocks. Defaults to 4096.
+                Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+                Ignored when `quant_state` is provided.
+
+        Raises:
+            ValueError: Raised when the input data type is not supported.
+
+        Returns:
+            `torch.Tensor`:
+                The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
+        """
+        ...
 
     @abstractmethod
     def optimizer_update_8bit_blockwise(
@@ -193,6 +335,8 @@ def optimizer_update_8bit_blockwise(
         state2: Optional[torch.Tensor],
         beta1: float,
         beta2: float,
+        beta3: float,
+        alpha: float,
         eps: float,
         step: int,
         lr: float,
@@ -241,6 +385,8 @@ def optimizer_update_32bit(
         lr: float,
         state2: Optional[torch.Tensor] = None,
         beta2: float = 0.0,
+        beta3: float = 0.0,
+        alpha: float = 0.0,
         weight_decay: float = 0.0,
         gnorm_scale: float = 1.0,
         unorm_vec: Optional[torch.Tensor] = None,

From 07c23de3f738d3f716db1dd6615417875f08e197 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 7 Feb 2025 14:47:00 -0500
Subject: [PATCH 210/234] Update NPU backend with new spec

---
 bitsandbytes/backends/npu.py | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/bitsandbytes/backends/npu.py b/bitsandbytes/backends/npu.py
index ecbc2f351..d22fe04e8 100644
--- a/bitsandbytes/backends/npu.py
+++ b/bitsandbytes/backends/npu.py
@@ -1,5 +1,5 @@
 import ctypes as ct
-from typing import Literal, Optional, Tuple, Union
+from typing import Literal, Optional, Tuple
 
 import torch
 
@@ -29,7 +29,7 @@ def assert_on_npu(tensors):
 
 
 class NPUBackend(Backend):
-    def double_quant(
+    def int8_double_quant(
         self,
         A: torch.Tensor,
         col_stats: Optional[torch.Tensor] = None,
@@ -37,7 +37,17 @@ def double_quant(
         out_col: Optional[torch.Tensor] = None,
         out_row: Optional[torch.Tensor] = None,
         threshold=0.0,
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        raise NotImplementedError
+
+    def int8_vectorwise_dequant(self, A, stats):
+        return super().int8_vectorwise_dequant(A, stats)
+
+    def int8_vectorwise_quant(
+        self,
+        A: torch.Tensor,
+        threshold=0.0,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
         raise NotImplementedError
 
     def transform(
@@ -52,27 +62,21 @@ def transform(
     ):
         raise NotImplementedError
 
-    def igemmlt(
+    def int8_linear_matmul(
         self,
         A: torch.Tensor,
         B: torch.Tensor,
-        SA: Tuple[torch.Size, str],
-        SB: Tuple[torch.Size, str],
         out: Optional[torch.Tensor] = None,
-        Sout: Optional[Tuple[torch.Size, str]] = None,
         dtype=torch.int32,
-    ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
+    ) -> torch.Tensor:
         raise NotImplementedError
 
-    def mm_dequant(
+    def int8_mm_dequant(
         self,
         A: torch.Tensor,
-        quant_state: Tuple[torch.Size, str],
         row_stats: torch.Tensor,
         col_stats: torch.Tensor,
         out: Optional[torch.Tensor] = None,
-        new_row_stats: Optional[torch.Tensor] = None,
-        new_col_stats: Optional[torch.Tensor] = None,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         raise NotImplementedError
@@ -268,6 +272,8 @@ def optimizer_update_8bit_blockwise(
         state2: Optional[torch.Tensor],
         beta1: float,
         beta2: float,
+        beta3: float,
+        alpha: float,
         eps: float,
         step: int,
         lr: float,
@@ -293,6 +299,8 @@ def optimizer_update_32bit(
         lr: float,
         state2: Optional[torch.Tensor] = None,
         beta2: float = 0.0,
+        beta3: float = 0.0,
+        alpha: float = 0.0,
         weight_decay: float = 0.0,
         gnorm_scale: float = 1.0,
         unorm_vec: Optional[torch.Tensor] = None,

From 94d6027747c1ed062ede9f35aa61bb198398f7c5 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 10 Feb 2025 10:12:18 -0500
Subject: [PATCH 211/234] Update CPU tests

---
 bitsandbytes/backends/base.py           |  3 +-
 bitsandbytes/backends/cpu.py            | 41 ++++++++-------
 bitsandbytes/backends/cpu_xpu_common.py | 66 +++++++++++++------------
 bitsandbytes/backends/xpu.py            | 41 ++++++++-------
 tests/test_functional.py                | 43 +++++-----------
 5 files changed, 92 insertions(+), 102 deletions(-)

diff --git a/bitsandbytes/backends/base.py b/bitsandbytes/backends/base.py
index 818897ed2..69fabe1c6 100644
--- a/bitsandbytes/backends/base.py
+++ b/bitsandbytes/backends/base.py
@@ -113,7 +113,8 @@ def int8_vectorwise_dequant(self, A: torch.Tensor, stats: torch.Tensor):
         Returns:
             `torch.Tensor` with dtype `torch.float32`: The dequantized tensor.
         """
-        ...
+        # To dequantize we divide by 127, or multiply by the reciprocal.
+        return A * stats.view(-1, 1) * 7.874015718698502e-3
 
     @abstractmethod
     def int8_vectorwise_quant(self, A: torch.Tensor, threshold=0.0):
diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 5d38171d5..aa7249309 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -1,4 +1,4 @@
-from typing import Literal, Optional, Tuple, Union
+from typing import Literal, Optional, Tuple
 
 import torch
 
@@ -9,8 +9,8 @@
     dequantize_4bit_impl,
     double_quant_impl,
     gemm_4bit_impl,
-    igemmlt_impl,
-    mm_dequant_impl,
+    int8_linear_matmul_impl,
+    int8_mm_dequant_impl,
     quantize_4bit_impl,
 )
 
@@ -35,7 +35,7 @@ class CPUBackend(Backend):
     mm_dequant_compute_dtype = torch.bfloat16
     mm_dequant_output_dtype = torch.bfloat16
 
-    def double_quant(
+    def int8_double_quant(
         self,
         A: torch.Tensor,
         col_stats: Optional[torch.Tensor] = None,
@@ -43,7 +43,7 @@ def double_quant(
         out_col: Optional[torch.Tensor] = None,
         out_row: Optional[torch.Tensor] = None,
         threshold=0.0,
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
         assert_on_cpu([A, col_stats, row_stats, out_col, out_row])
         return double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
 
@@ -75,44 +75,43 @@ def transform(
                 out = A
         return out, state
 
-    def igemmlt(
+    def int8_linear_matmul(
         self,
         A: torch.Tensor,
         B: torch.Tensor,
-        SA: Tuple[torch.Size, str],
-        SB: Tuple[torch.Size, str],
         out: Optional[torch.Tensor] = None,
-        Sout: Optional[Tuple[torch.Size, str]] = None,
         dtype=torch.int32,
-    ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
+    ) -> torch.Tensor:
         assert_on_cpu([A, B])
-        return igemmlt_impl(A, B, SA, SB, out, Sout, dtype)
+        return int8_linear_matmul_impl(A, B, out, dtype)
 
-    def mm_dequant(
+    def int8_mm_dequant(
         self,
         A: torch.Tensor,
-        quant_state: Tuple[torch.Size, str],
         row_stats: torch.Tensor,
         col_stats: torch.Tensor,
         out: Optional[torch.Tensor] = None,
-        new_row_stats: Optional[torch.Tensor] = None,
-        new_col_stats: Optional[torch.Tensor] = None,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         assert_on_cpu([A, row_stats, col_stats, out, bias])
-        return mm_dequant_impl(
+        return int8_mm_dequant_impl(
             A,
-            quant_state,
             row_stats,
             col_stats,
             out,
-            new_row_stats,
-            new_col_stats,
             bias,
             self.mm_dequant_compute_dtype,
             self.mm_dequant_output_dtype,
         )
 
+    def int8_vectorwise_dequant(self, A, stats):
+        return super().int8_vectorwise_dequant(A, stats)
+
+    def int8_vectorwise_quant(self, A: torch.Tensor, threshold=0.0):
+        # TODO: We can optimize this as we don't actually need column-wise quant.
+        out, _, stats, _, outlier_cols = self.int8_double_quant(A, threshold=threshold)
+        return out, stats, outlier_cols
+
     def extract_outliers(
         self,
         A: torch.Tensor,
@@ -202,6 +201,8 @@ def optimizer_update_8bit_blockwise(
         state2: Optional[torch.Tensor],
         beta1: float,
         beta2: float,
+        beta3: float,
+        alpha: float,
         eps: float,
         step: int,
         lr: float,
@@ -227,6 +228,8 @@ def optimizer_update_32bit(
         lr: float,
         state2: Optional[torch.Tensor] = None,
         beta2: float = 0.0,
+        beta3: float = 0.0,
+        alpha: float = 0.0,
         weight_decay: float = 0.0,
         gnorm_scale: float = 1.0,
         unorm_vec: Optional[torch.Tensor] = None,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 75f647939..5987f4f61 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -22,6 +22,7 @@
 except BaseException:
     ipex_cpu = None
     ipex_xpu = None
+    ipex_cpu_only = None
 
 
 gxx_available = False
@@ -88,7 +89,6 @@ def double_quant_impl(A, col_stats=None, row_stats=None, out_col=None, out_row=N
         A tuple of output quantized per row, output quantized per column, absolute max values of
         each row of A, absolute max values of each column of A, outliers in COO format
     """
-    from ..functional import COOSparseTensor
 
     cols = A.shape[-1]
     if len(A.shape) == 3:
@@ -98,8 +98,6 @@ def double_quant_impl(A, col_stats=None, row_stats=None, out_col=None, out_row=N
         rows = A.shape[0]
     A = A.reshape(rows, cols)
 
-    coo_tensor = None
-
     def get_row_col_stats(A):
         row_stats = torch.max(torch.abs(A), 1).values  # absolute max of each row
         col_stats = torch.max(torch.abs(A), 0).values  # absolute max of each col
@@ -111,15 +109,20 @@ def quant_to_int8(A, stats):
     if threshold == 0.0:
         if row_stats is None or col_stats is None:
             row_stats, col_stats = get_row_col_stats(A)
+        outlier_cols = None
     else:
         outlier_indices = torch.abs(A) >= threshold  # find outliers
-        outlier_coord = outlier_indices.nonzero()  # get outlier coordinates
-        outlier_rows = outlier_coord[:, 0]  # outlier row for COO sparse tensor
-        outlier_cols = outlier_coord[:, 1]  # outlier column for COO sparse tensor
-        outlier_values = A[outlier_indices]  # outlier values for COO sparse tensor
-        coo_tensor = COOSparseTensor(
-            A.shape[0], A.shape[1], outlier_values.numel(), outlier_rows.int(), outlier_cols.int(), outlier_values
-        )
+        outlier_cols = torch.argwhere(outlier_indices.any(dim=0)).view(-1)
+        outlier_values = A[outlier_indices].clone()
+
+        # outlier_indices = torch.abs(A) >= threshold  # find outliers
+        # outlier_coord = outlier_indices.nonzero()  # get outlier coordinates
+        # outlier_rows = outlier_coord[:, 0]  # outlier row for COO sparse tensor
+        # outlier_cols = outlier_coord[:, 1]  # outlier column for COO sparse tensor
+        # outlier_values = A[outlier_indices]  # outlier values for COO sparse tensor
+        # coo_tensor = COOSparseTensor(
+        #     A.shape[0], A.shape[1], outlier_values.numel(), outlier_rows.int(), outlier_cols.int(), outlier_values
+        # )
         if row_stats is None or col_stats is None:
             A[outlier_indices] = 0  # zero out outliers
             row_stats, col_stats = get_row_col_stats(A)
@@ -127,9 +130,13 @@ def quant_to_int8(A, stats):
     quant_by_row = quant_to_int8(A, row_stats.unsqueeze(-1))
     quant_by_col = quant_to_int8(A, col_stats.unsqueeze(0))
 
-    if coo_tensor is not None:
+    if outlier_cols is not None:
         A[outlier_indices] = outlier_values  # restore outliers for later use
 
+        if rows > 1:
+            # zero out outlier columns for all rows
+            quant_by_row[:, outlier_cols] = 0
+
     if out_row is not None:
         out_row.copy_(quant_by_row)
     else:
@@ -139,23 +146,26 @@ def quant_to_int8(A, stats):
     else:
         out_col = quant_by_col
     # Return float stats to align with CUDA impl
-    return out_row, out_col, row_stats.float(), col_stats.float(), coo_tensor
+    return out_row, out_col, row_stats.float(), col_stats.float(), outlier_cols
 
 
-def igemmlt_impl(A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32):
+def int8_linear_matmul_impl(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    dtype=torch.int32,
+) -> torch.Tensor:
     """
     Do GEMMM computation. Data type: int8 * int8 -> int32.
     Args:
         A Activation of linear, data type is int8
         B Weight of linear, data type is int8
-        SA Not used for CPU/XPU
-        SB Not used for CPU/XPU
         out Specified output tensor if it is not None
-        Sout Not used for CPU/XPU but returned as is
         dtype Data type of output
     Return:
         A tuple of GEMM result in dtype and Sout
     """
+
     assert A.dtype == torch.int8
     assert B.dtype == torch.int8
     if out is not None:
@@ -198,33 +208,27 @@ def igemmlt_impl(A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32)
     else:
         out = C
 
-    return out, Sout
+    return out
 
 
 @_maybe_torch_compile
-def mm_dequant_impl(
-    A,
-    quant_state,
-    row_stats,
-    col_stats,
-    out=None,
-    new_row_stats=None,
-    new_col_stats=None,
-    bias=None,
+def int8_mm_dequant_impl(
+    A: torch.Tensor,
+    row_stats: torch.Tensor,
+    col_stats: torch.Tensor,
+    out: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
     compute_dtype=torch.float32,
     output_dtype=torch.float32,
-):
+) -> torch.Tensor:
     """
     Dequant and add bias
     out = A_int32 * (abs_max_A * abs_max_B) / 127 * 127 + bias
     Args:
         A The output of int8 gemm, whose dtype is int32
-        quant_state Not used for CPU
         row_stats Absolute max value of each row of input (A) of gemm
         col_stats Absolute max value of each row of weight (B) of gemm
         out Output buffer
-        new_row_stats Not used for CPU/XPU
-        new_col_stats Not used for CPU/XPU
         bias Bias of linear
         compute_dtype Data type for computation
         output_dtype Data type for output
@@ -563,7 +567,7 @@ def gemm_4bit_impl(
             state.compensation,
         )
     else:
-        dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize).t()
+        dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize)
         output = torch.matmul(A, dqB.to(A.dtype))
     if out is not None:
         out.copy_(output)
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index aca0a0103..142d98bb6 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -1,4 +1,4 @@
-from typing import Literal, Optional, Tuple, Union
+from typing import Literal, Optional, Tuple
 
 import torch
 
@@ -9,8 +9,8 @@
     dequantize_4bit_impl,
     double_quant_impl,
     gemm_4bit_impl,
-    igemmlt_impl,
-    mm_dequant_impl,
+    int8_linear_matmul_impl,
+    int8_mm_dequant_impl,
     quantize_4bit_impl,
 )
 
@@ -35,7 +35,7 @@ class XPUBackend(Backend):
     mm_dequant_compute_dtype = torch.bfloat16
     mm_dequant_output_dtype = torch.bfloat16
 
-    def double_quant(
+    def int8_double_quant(
         self,
         A: torch.Tensor,
         col_stats: Optional[torch.Tensor] = None,
@@ -43,7 +43,7 @@ def double_quant(
         out_col: Optional[torch.Tensor] = None,
         out_row: Optional[torch.Tensor] = None,
         threshold=0.0,
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
         assert_on_xpu([A, col_stats, row_stats, out_col, out_row])
         output = double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold)
         return output
@@ -76,46 +76,45 @@ def transform(
                 out = A
         return out, state
 
-    def igemmlt(
+    def int8_linear_matmul(
         self,
         A: torch.Tensor,
         B: torch.Tensor,
-        SA: Tuple[torch.Size, str],
-        SB: Tuple[torch.Size, str],
         out: Optional[torch.Tensor] = None,
-        Sout: Optional[Tuple[torch.Size, str]] = None,
         dtype=torch.int32,
-    ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]:
+    ) -> torch.Tensor:
         assert_on_xpu([A, B])
-        output = igemmlt_impl(A, B, SA, SB, out, Sout, dtype)
+        output = int8_linear_matmul_impl(A, B, out, dtype)
         return output
 
-    def mm_dequant(
+    def int8_mm_dequant(
         self,
         A: torch.Tensor,
-        quant_state: Tuple[torch.Size, str],
         row_stats: torch.Tensor,
         col_stats: torch.Tensor,
         out: Optional[torch.Tensor] = None,
-        new_row_stats: Optional[torch.Tensor] = None,
-        new_col_stats: Optional[torch.Tensor] = None,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         assert_on_xpu([A, row_stats, col_stats, out, bias])
-        output = mm_dequant_impl(
+        output = int8_mm_dequant_impl(
             A,
-            quant_state,
             row_stats,
             col_stats,
             out,
-            new_row_stats,
-            new_col_stats,
             bias,
             self.mm_dequant_compute_dtype,
             self.mm_dequant_output_dtype,
         )
         return output
 
+    def int8_vectorwise_dequant(self, A, stats):
+        return super().int8_vectorwise_dequant(A, stats)
+
+    def int8_vectorwise_quant(self, A: torch.Tensor, threshold=0.0):
+        # TODO: We can optimize this as we don't actually need column-wise quant.
+        out, _, stats, _, outlier_cols = self.int8_double_quant(A, threshold=threshold)
+        return out, stats, outlier_cols
+
     def extract_outliers(
         self,
         A: torch.Tensor,
@@ -209,6 +208,8 @@ def optimizer_update_8bit_blockwise(
         state2: Optional[torch.Tensor],
         beta1: float,
         beta2: float,
+        beta3: float,
+        alpha: float,
         eps: float,
         step: int,
         lr: float,
@@ -234,6 +235,8 @@ def optimizer_update_32bit(
         lr: float,
         state2: Optional[torch.Tensor] = None,
         beta2: float = 0.0,
+        beta3: float = 0.0,
+        alpha: float = 0.0,
         weight_decay: float = 0.0,
         gnorm_scale: float = 1.0,
         unorm_vec: Optional[torch.Tensor] = None,
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 83ee77e50..6e3de18c0 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -516,8 +516,8 @@ def test_dequant_mm(dim1, dim4, dims, has_bias):
         assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01 * n))
 
 
-@pytest.mark.parametrize("dim1", get_test_dims(64, 256, n=2), ids=id_formatter("dim1"))
-@pytest.mark.parametrize("dim4", get_test_dims(64, 1024, n=2), ids=id_formatter("dim4"))
+@pytest.mark.parametrize("dim1", [64, 256], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim4", [64, 1024], ids=id_formatter("dim4"))
 @pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims"))
 @pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias"))
 def test_dequant_mm_cpu(dim1, dim4, dims, has_bias):
@@ -532,14 +532,14 @@ def test_dequant_mm_cpu(dim1, dim4, dims, has_bias):
         A1, maxA = F.vectorwise_quant(A, dim=1)
         B1, maxB = F.vectorwise_quant(B, dim=1)
 
-        C2, SC = F.igemmlt(A1, B1, SA=None, SB=None)
-        assert SC is None
+        C2 = F.int8_linear_matmul(A1, B1)
 
         C3 = F.vectorwise_mm_dequant(C2.bfloat16(), maxA, maxB.t())
         if has_bias:
             C3 += bias
 
-        C4 = F.mm_dequant(C2, SC, maxA.flatten(), maxB.flatten(), bias=bias)
+        C4 = F.int8_mm_dequant(C2, maxA.flatten(), maxB.flatten(), bias=bias)
+
         torch.testing.assert_close(C3.float(), C4.float(), atol=0.05, rtol=0.1)
 
 
@@ -798,12 +798,12 @@ def test_transform_cpu(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpos
 @pytest.mark.parametrize("dim2", [1024, 4096], ids=id_formatter("dim2"))
 @pytest.mark.parametrize("device", ["cuda", "cpu"], ids=id_formatter("device"))
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype"))
-def test_coo_double_quant(dim1, dim2, device, dtype):
+def test_coo_int8_vectorwise_quant(dim1, dim2, device, dtype):
     if device == "cuda" and dtype == torch.bfloat16:
         pytest.skip("bfloat16 is not implemented for this operation on CUDA backend")
     threshold = 2.00
     for i in range(k):
-        A = torch.randn(dim1, dim2, device="cuda").half()
+        A = torch.randn(dim1, dim2, device=device).to(dtype)
 
         idx = torch.abs(A) >= threshold
         CA, statsA, outlier_cols = F.int8_vectorwise_quant(A, threshold=threshold)
@@ -814,31 +814,10 @@ def test_coo_double_quant(dim1, dim2, device, dtype):
             torch.testing.assert_close(A1, A2)
 
             A[:, outlier_cols] = 0
-            A2 = (CA.float() * statsA.unsqueeze(1) / 127).half()
+            A2 = (CA.float() * statsA.unsqueeze(1) / 127).to(dtype)
             torch.testing.assert_close(A, A2, rtol=0.05, atol=1.5e-2)
 
 
-@pytest.mark.parametrize("dim1", [512, 2048], ids=id_formatter("dim1"))
-@pytest.mark.parametrize("dim2", [1024, 4096], ids=id_formatter("dim2"))
-@pytest.mark.parametrize("device", ["cuda", "cpu"], ids=id_formatter("device"))
-@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype"))
-def test_coo_int8_vectorwise_quant(dim1, dim2, device, dtype):
-    if device == "cuda" and dtype == torch.bfloat16:
-        pytest.skip("bfloat16 is not implemented for this operation on CUDA backend")
-
-    threshold = 2.00
-    for i in range(k):
-        A = torch.randn(dim1, dim2, device=device).to(dtype)
-
-        idx = torch.abs(A) >= threshold
-        CA, statsA, outlier_cols = F.int8_vectorwise_quant(A, threshold=threshold)
-
-        if outlier_cols is not None:
-            A2 = (CA.float() * statsA.unsqueeze(1) / 127).half()
-            A[:, outlier_cols] = 0
-            torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2)
-
-
 @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.parametrize("dim1", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim1"))
 @pytest.mark.parametrize("dim2", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim2"))
@@ -1480,8 +1459,8 @@ def test_4bit_quant(dtype, quant_type, blocksize, device):
     A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
     qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
     A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
-    if device == "cpu":
-        A2 = A2.t()
+    # if device == "cpu":
+    #     A2 = A2.t()
 
     err = (A1 - A2).abs().float()
     relerr = (err / (A1.abs().float() + 1e-8)).mean()
@@ -1767,7 +1746,7 @@ def test_gemv_4bit_cpu(dtype, quant_type, kind):
                 quant_storage=torch.uint8,
             )
             dqB = F.dequantize_4bit(qB, state)
-            C3 = torch.matmul(A, dqB)
+            C3 = torch.matmul(A, dqB.t())
             C2 = F.gemv_4bit(A, qB.t(), state=state)
             A.requires_grad = True
             C1 = bnb.matmul_4bit(A, qB.t(), state)

From 3fabd1a9571bfd8dd737c451620d55fd5288ca29 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 10 Feb 2025 14:09:47 -0500
Subject: [PATCH 212/234] ROCm: Fix compilation.

---
 csrc/kernels.hip         |  16 +--
 csrc/kernels_hip.cuh     |   6 +-
 csrc/ops.hip             | 269 ++++++++++++++++++---------------------
 csrc/ops_hip.cuh         |  20 +--
 csrc/pythonInterface.cpp |  87 +++----------
 5 files changed, 167 insertions(+), 231 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index d8d7cdba5..b804d7ccc 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -973,7 +973,7 @@ template<typename T, int OPTIMIZER>
 __launch_bounds__(TH, 1)
 __global__ void kOptimizer32bit2State(T* g, T* p,
                 float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
-                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const float beta1, const float beta2, const float beta3, const float alpha, const float eps, const float weight_decay,
                 const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n)
 {
 
@@ -1742,7 +1742,7 @@ template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
 __launch_bounds__(256, 3)
 __global__ void
 kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
-                const float beta1, const float beta2,
+                const float beta1, const float beta2, const float beta3, const float alpha,
                 const float eps, const int step, const float lr,
                 float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                 float* absmax1, float* absmax2,
@@ -2268,7 +2268,7 @@ template __global__ void kgetColRowStats<half, 64, 4, 16, 64*4, 1>(half * __rest
 
 #define MM_DEQUANT_CONST 6.200012e-05f //1.0f/(127.0f*127.0f)
 
-template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats, half *out, float* newRowStats, float* newcolStats, half *__restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n)
+template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats, half *out, half *__restrict__ const bias, const int numRows, const int numCols, const int n)
 {
 
   // Strategy: To dequantize we need to load col/row statistics. This can be very expensive
@@ -3851,7 +3851,7 @@ template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL_TURING>(
 template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL_AMPERE>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
 template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL_AMPERE>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
 
-template __global__ void kdequant_mm_int32_fp16<4, 128, 512>(int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats, half *out, float* newRowStats, float* newcolStats, half * __restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n);
+template __global__ void kdequant_mm_int32_fp16<4, 128, 512>(int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats, half *out, half * __restrict__ const bias, const int numRows, const int numCols, const int n);
 
 template __global__ void kDoubleRowColQuant<64, 4, 16, 64*4, 0>(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);
 template __global__ void kDoubleRowColQuant<64, 4, 16, 64*4, 1>(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);
@@ -3903,11 +3903,11 @@ MAKE_PreconditionOptimizer32bit2State(ADAM, half)
 MAKE_PreconditionOptimizer32bit2State(ADAM, hip_bfloat16)
 
 template __global__ void kOptimizer32bit2State<float, ADAM>(float* g, float* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
-    const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+    const float beta1, const float beta2, const float beta3, const float alpha, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
 template __global__ void kOptimizer32bit2State<half, ADAM>(half* g, half* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
-    const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+    const float beta1, const float beta2, const float beta3, const float alpha, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
 template __global__ void kOptimizer32bit2State<hip_bfloat16, ADAM>(hip_bfloat16* g, hip_bfloat16* p, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
-    const float beta1, const float beta2, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+    const float beta1, const float beta2, const float beta3, const float alpha, const float eps, const float weight_decay,const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
 
 
 #define MAKE_PreconditionStatic8bit1State(oname, gtype) \
@@ -4068,7 +4068,7 @@ template __global__ void kDequantizeBlockwise<hip_bfloat16, 512, 64, 8, NF4>(flo
 
 #define MAKE_OptimizerStatic8bit2StateBlockwise(oname, gtype, block_size, num_per_thread) \
 template __global__ void kOptimizerStatic8bit2StateBlockwise<gtype, oname, block_size, num_per_thread>(gtype* p, gtype* __restrict__ const g, unsigned char* state1, unsigned char* state2, \
-                const float beta1, const float beta2, \
+                const float beta1, const float beta2, const float beta3, const float alpha, \
                 const float eps, const int step, const float lr, \
                 float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, \
                 float* absmax1, float* absmax2,  \
diff --git a/csrc/kernels_hip.cuh b/csrc/kernels_hip.cuh
index 430218736..6768302f9 100644
--- a/csrc/kernels_hip.cuh
+++ b/csrc/kernels_hip.cuh
@@ -30,7 +30,7 @@ __global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
 template<typename T, int OPTIMIZER>
 __global__ void kOptimizer32bit2State(T* g, T* p,
                 float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
-                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const float beta1, const float beta2, const float beta3, const float alpha, const float eps, const float weight_decay,
                 const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
 
 template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
@@ -92,7 +92,7 @@ kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned cha
 
 template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit2StateBlockwise(
 		T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
-                const float beta1, const float beta2, const float eps, const int step, const float lr,
+                const float beta1, const float beta2, const float beta3, const float alpha, const float eps, const int step, const float lr,
                 float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                 float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n);
 
@@ -116,7 +116,7 @@ template <typename T, int SPMM_ITEMS, int BITS> __global__ void kspmm_coo_very_s
 
 template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(
   int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats,
-  half *out, float* newRowStats, float* newcolStats, half * __restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n);
+  half *out,  half * __restrict__ const bias, const int numRows, const int numCols, const int n);
 
 template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kgetColRowStats(T * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
 template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kDoubleRowColQuant(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 4fdc3cbfa..8398a6171 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -50,11 +50,11 @@ void quantize(float *code, float *A, unsigned char *out, int n)
   CUDA_CHECK_RETURN(hipPeekAtLastError());
 }
 
-void dequantize(float *code, unsigned char *A, float *out, int n)
+void dequantize(float *code, unsigned char *A, float *out, int n, hipStream_t stream)
 {
   int num_blocks = n/1024;
   num_blocks = n % 1024 == 0 ? num_blocks : num_blocks + 1;
- hipLaunchKernelGGL(( kDequantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
+ hipLaunchKernelGGL(( kDequantize), dim3(num_blocks), dim3(1024), 0, stream, code, A, out, n);
   CUDA_CHECK_RETURN(hipPeekAtLastError());
 }
 
@@ -82,16 +82,16 @@ template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(floa
   CUDA_CHECK_RETURN(hipPeekAtLastError());
 }
 
-template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n)
+template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n, hipStream_t stream)
 {
   int num_blocks = n/blocksize;
   num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
   int tile_size = (DATA_TYPE > 0) ? 1024 : 512;
 
   if(DATA_TYPE > 0)
-   hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, 0, code, A, absmax, out, blocksize/2, n);
+   hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize/2, n);
   else
-   hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, 0, code, A, absmax, out, blocksize, n);
+   hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize, n);
 
   CUDA_CHECK_RETURN(hipPeekAtLastError());
 }
@@ -107,7 +107,8 @@ template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsign
 
 template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
                 float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
-                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const float beta1, const float beta2, const float beta3, const float alpha,
+                const float eps, const float weight_decay,
                 const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n)
 {
   int num_blocks = n/4096;
@@ -121,9 +122,12 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
        hipLaunchKernelGGL(( kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
         CUDA_CHECK_RETURN(hipPeekAtLastError());
       }
-		hipLaunchKernelGGL((	kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
+		hipLaunchKernelGGL((	kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
       CUDA_CHECK_RETURN(hipPeekAtLastError());
 			break;
+    case ADEMAMIX:
+      // TODO: Not implemented!
+      break;
 		case MOMENTUM:
     case RMSPROP:
     case ADAGRAD:
@@ -209,7 +213,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
 #define NUM_1STATE 8
 
 template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
-                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
+                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float beta3, float alpha, float eps, int step, float lr,
                 float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n)
 {
 
@@ -219,10 +223,11 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g
 		case ADAM:
 			num_blocks = n/BLOCKSIZE_2STATE;
 			num_blocks = n % BLOCKSIZE_2STATE == 0 ? num_blocks : num_blocks + 1;
-		hipLaunchKernelGGL((	kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, eps, step, lr,
+		hipLaunchKernelGGL((	kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr,
 																														quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
 			CUDA_CHECK_RETURN(hipPeekAtLastError());
 		break;
+    case ADEMAMIX: break; // TODO: Not implemented!
 		case MOMENTUM:
 		case RMSPROP:
     case ADAGRAD:
@@ -509,88 +514,42 @@ static std::string hipError_to_string(const hipError_t ret)
 	}
 }
 
-template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream)
 {
 #ifdef NO_HIPBLASLT
 	return ERR_NOT_IMPLEMENTED;
 #else
-    int has_error = 0;
-    hipblasLtMatmulDesc_t matmulDesc = NULL;
-    hipblasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
-    hipblasOperation_t opT = HIPBLAS_OP_T;
-    //hipblasLtPointerMode_t alphaVec = hipblasLt_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
-    hipblasLtOrder_t col32 = HIPBLASLT_ORDER_COL;
-    hipblasLtOrder_t col_turing = HIPBLASLT_ORDER_COL;
-    hipblasLtOrder_t col_ampere = HIPBLASLT_ORDER_COL;
-
-    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Adesc, HIP_R_8I, m, k, lda));
-    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Bdesc, HIP_R_8I, n, k, ldb));
-    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Adesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
-
-
-    if(FORMATB == COL_TURING)
-      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
-    else
-      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Bdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
 
-    const int64_t max_workspace_size = 0;//set to 0 to avoid choosing GSU kernel
+  // Calculate C = A^T @ B, in col-major layout.
+  //
+  // Use the IMMA kernels requires:
+  // * A must be transposed and B must be non-transposed.
+  // * Dimensions m and k must be multiples of 4.
+  // * All pointers must be 4-byte aligned; 16-byte alignment preferred.
 
-    if(DTYPE_OUT == 32)
-    {
-      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIP_R_32I));
-      auto opA = HIPBLAS_OP_N;
-      has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(int32_t)));
-      has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(int32_t)));
-      hipblasLtEpilogue_t epilogue = HIPBLASLT_EPILOGUE_DEFAULT;
-      checkHipblasStatus(hipblasLtMatmulDescSetAttribute(
-        matmulDesc, HIPBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)));
-      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIP_R_32I, m, n, ldc));
-      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
-      int alpha = 1, beta = 0;
+  int has_error = 0;
+  const int64_t max_workspace_size = 0;//set to 0 to avoid choosing GSU kernel
 
+  hipblasLtMatmulDesc_t matmulDesc;
+  hipblasLtMatrixLayout_t aDesc, bDesc, cDesc;
+  hipblasOperation_t opT = HIPBLAS_OP_T;
 
-      /* Algo and workspace TODO: need to rework to not be duplicated */
-      // Set User Preference attributes
-      hipblasLtMatmulPreference_t pref;
-      checkHipblasStatus(hipblasLtMatmulPreferenceCreate(&pref));
-      checkHipblasStatus(
-          hipblasLtMatmulPreferenceSetAttribute(pref,
-                                                HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
-                                                &max_workspace_size,
-                                                sizeof(max_workspace_size)));
+  hipDataType outType = DTYPE_OUT == 32 ? HIP_R_32I : HIP_R_8I;
+  hipDataType scaleType = DTYPE_OUT == 32 ? HIP_R_32I : HIP_R_32F;
 
-      const int                        request_solutions = 1;
-      hipblasLtMatmulHeuristicResult_t heuristicResult[request_solutions];
-      int                              returnedAlgoCount = 0;
-      checkHipblasStatus(hipblasLtMatmulAlgoGetHeuristic(ltHandle,
-                                                            matmulDesc,
-                                                            Adesc,
-                                                            Bdesc,
-                                                            Cdesc,
-                                                            Cdesc,
-                                                            pref,
-                                                            request_solutions,
-                                                            heuristicResult,
-                                                            &returnedAlgoCount));
+  hipblasLtPointerMode_t pointerMode = HIPBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST;
+
+  has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&aDesc, HIP_R_8I, m, k, lda));
+  has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&bDesc, HIP_R_8I, m, n, ldb));
+  has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&cDesc, outType, k, n, ldc));
+
+  // Default layout order is col major
+
+  has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, scaleType));
+  has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSA, &opT, sizeof(opT)));
+
+  if (DTYPE_OUT == 32) {
 
-      if (returnedAlgoCount == 0)
-      {
-        has_error = 1;
-        fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
-      }
-      else
-      {
-        has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
-      }
-    }
-    else
-    {
-      has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, HIP_R_8I));
-      hipblasOperation_t opA = HIPBLAS_OP_N;
-      has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSA, &opA, sizeof(opA)));
-      has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
-      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&Cdesc, HIP_R_8I, m, n, ldc));
-      has_error |= checkHipblasStatus(hipblasLtMatrixLayoutSetAttribute(Cdesc, HIPBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
       /* Algo and workspace TODO: need to rework to not be duplicated */
       // Set User Preference attributes
       hipblasLtMatmulPreference_t pref;
@@ -606,10 +565,10 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
       int                              returnedAlgoCount = 0;
       checkHipblasStatus(hipblasLtMatmulAlgoGetHeuristic(ltHandle,
                                                             matmulDesc,
-                                                            Adesc,
-                                                            Bdesc,
-                                                            Cdesc,
-                                                            Cdesc,
+                                                            aDesc,
+                                                            bDesc,
+                                                            cDesc,
+                                                            cDesc,
                                                             pref,
                                                             request_solutions,
                                                             heuristicResult,
@@ -619,33 +578,59 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
       {
         has_error = 1;
         fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
+      } else {
+        int alpha = 1, beta = 0;
+        has_error |= checkHipblasStatus(hipblasLtMatmul(
+          ltHandle, matmulDesc,
+          &alpha, A, aDesc,
+          B, bDesc, &beta,
+          (int32_t*)C, cDesc,
+          (int32_t*)C, cDesc,
+          &heuristicResult[0].algo, NULL, 0, stream
+        ));
       }
-      else
-      {
-        if(!SCALE_ROWS)
-        {
-          float alpha = 1.0f, beta = 0.0f;
-
-          has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
-        }
-        else
-        {
-          float beta = 0.0f;
-
-          has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
-        }
-      }
+  } else {
+    // This path is unlikely to be used, as 8-bit accumulation can lead to likely overflows.
+
+    if (!SCALE_ROWS) {
+      float alpha = 1.0f, beta = 0.0f;
+      has_error |= checkHipblasStatus(hipblasLtMatmul(
+        ltHandle, matmulDesc,
+        &alpha, A, aDesc,
+        B, bDesc, &beta,
+        (int8_t*)C, cDesc,
+        (int8_t*)C, cDesc,
+        NULL, NULL, 0, stream
+      ));
+    } else {
+      hipblasLtPointerMode_t alphaVec = HIPBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST;
+      float beta = 0.0f;
+      has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(
+        matmulDesc,
+        HIPBLASLT_MATMUL_DESC_POINTER_MODE,
+        &pointerMode,
+        sizeof(alphaVec)
+      ));
+      has_error |= checkHipblasStatus(hipblasLtMatmul(
+        ltHandle, matmulDesc,
+        row_scale, A, aDesc,
+        B, bDesc, &beta,
+        (int8_t*)C, cDesc,
+        (int8_t*)C, cDesc,
+        NULL, NULL, 0, stream
+      ));
     }
+  }
 
+  has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(cDesc));
+  has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(bDesc));
+  has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(aDesc));
+  has_error |= checkHipblasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
 
-    if (Cdesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Cdesc));
-    if (Bdesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Bdesc));
-    if (Adesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Adesc));
-    if (matmulDesc) has_error |= checkHipblasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
-    if(has_error == 1)
-      fprintf(stderr, "error detected\n");
+  if(has_error == 1)
+    printf("error detected");
 
-    return has_error;
+  return has_error;
 #endif // NO_HIPBLASLT
 }
 
@@ -654,23 +639,15 @@ int fill_up_to_nearest_multiple(int value, int multiple)
   return value + (value % multiple == 0 ? 0 : (multiple - (value % multiple)));
 }
 
-void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half *bias, int numRows, int numCols)
+void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, half *bias, int numRows, int numCols, hipStream_t stream)
 {
-  int threads = 512;
-  //int tileCols = fill_up_to_nearest_multiple(numCols, 32);
-  //int n = numRows*tileCols;
-  int tileCols = numCols;
-  int n = numRows*numCols;
-  //int subtile_rows = 128;
-  //int tilesize = 32*subtile_rows;
-  //int num_blocks = numRows/subtile_rows;
-  //num_blocks += (numRows % subtile_rows == 0) ? 0 : 1;
-  //num_blocks = num_blocks*(tileCols/32);
-  //assert(threads <= tilesize);
-  int num_blocks = numRows * numCols / (threads * 4);
-  num_blocks += (numRows * numCols) % (threads * 4) == 0 ? 0 : 1;
-
- hipLaunchKernelGGL(( kdequant_mm_int32_fp16<4, 128, 512>), dim3(num_blocks), dim3(threads), 0, 0, A, rowStats, colStats, out, newRowStats, newcolStats, bias, numRows, numCols, tileCols, n);
+  const int threads = 512;
+  const int num_per_thread = 4;
+  const int num_per_block = threads * num_per_thread;
+  const int n = numRows*numCols;
+  const int num_blocks = (n + num_per_block - 1) / num_per_block;
+
+ hipLaunchKernelGGL(( kdequant_mm_int32_fp16<4, 128, 512>), dim3(num_blocks), dim3(threads), 0, stream, A, rowStats, colStats, out, bias, numRows, numCols, n);
   CUDA_CHECK_RETURN(hipPeekAtLastError());
 }
 
@@ -939,12 +916,9 @@ template void extractOutliers<COL_AMPERE>(char * A, int *idx, char *out, int idx
 template void spmm_coo_very_sparse_naive<half, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
 template void spmm_coo_very_sparse_naive<signed char, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
 
-template int igemmlt<COL_TURING, 32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_TURING, 8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_TURING, 8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_AMPERE, 32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_AMPERE, 8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<COL_AMPERE, 8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
+template int igemmlt<8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
+template int igemmlt<8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
 
 template void transformRowToFormat<COL32, 0>(char * A, char *out, int rows, int cols);
 template void transformRowToFormat<COL32, 1>(char * A, char *out, int rows, int cols);
@@ -969,20 +943,20 @@ template void quantizeBlockwise<hip_bfloat16, 0, General8bit>(float * code, hip_
 template void quantizeBlockwise<hip_bfloat16, 0, FP4>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template void quantizeBlockwise<hip_bfloat16, 0, NF4>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 
-template void dequantizeBlockwise<float, General8bit>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
-template void dequantizeBlockwise<float, FP4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
-template void dequantizeBlockwise<float, NF4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
-template void dequantizeBlockwise<half, General8bit>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
-template void dequantizeBlockwise<half, FP4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
-template void dequantizeBlockwise<half, NF4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
-template void dequantizeBlockwise<hip_bfloat16, General8bit>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n);
-template void dequantizeBlockwise<hip_bfloat16, FP4>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n);
-template void dequantizeBlockwise<hip_bfloat16, NF4>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n);
+template void dequantizeBlockwise<float, General8bit>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n, hipStream_t stream);
+template void dequantizeBlockwise<float, FP4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n, hipStream_t stream);
+template void dequantizeBlockwise<float, NF4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n, hipStream_t stream);
+template void dequantizeBlockwise<half, General8bit>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n, hipStream_t stream);
+template void dequantizeBlockwise<half, FP4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n, hipStream_t stream);
+template void dequantizeBlockwise<half, NF4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n, hipStream_t stream);
+template void dequantizeBlockwise<hip_bfloat16, General8bit>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n, hipStream_t stream);
+template void dequantizeBlockwise<hip_bfloat16, FP4>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n, hipStream_t stream);
+template void dequantizeBlockwise<hip_bfloat16, NF4>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n, hipStream_t stream);
 
 #define MAKE_optimizer32bit(name, gtype) \
 template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
                 float* state1, float* state2, float* unorm, float max_unorm, float param_norm, \
-                const float beta1, const float beta2, const float eps, const float weight_decay, \
+                const float beta1, const float beta2, const float beta3, const float alpha, const float eps, const float weight_decay, \
                 const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
 
 MAKE_optimizer32bit(ADAM, half)
@@ -990,13 +964,19 @@ MAKE_optimizer32bit(ADAM, float)
 MAKE_optimizer32bit(ADAM, hip_bfloat16)
 MAKE_optimizer32bit(MOMENTUM, half)
 MAKE_optimizer32bit(MOMENTUM, float)
+MAKE_optimizer32bit(MOMENTUM, hip_bfloat16)
 MAKE_optimizer32bit(RMSPROP, half)
 MAKE_optimizer32bit(RMSPROP, float)
+MAKE_optimizer32bit(RMSPROP, hip_bfloat16)
 MAKE_optimizer32bit(LION, half)
 MAKE_optimizer32bit(LION, float)
 MAKE_optimizer32bit(LION, hip_bfloat16)
 MAKE_optimizer32bit(ADAGRAD, half)
 MAKE_optimizer32bit(ADAGRAD, float)
+MAKE_optimizer32bit(ADAGRAD, hip_bfloat16)
+MAKE_optimizer32bit(ADEMAMIX, half)
+MAKE_optimizer32bit(ADEMAMIX, hip_bfloat16)
+MAKE_optimizer32bit(ADEMAMIX, float)
 
 #define MAKE_optimizerStatic8bit(name, gtype) \
 template void optimizerStatic8bit<gtype, name>(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
@@ -1019,22 +999,27 @@ MAKE_optimizerStatic8bit(LION, float)
 
 #define MAKE_optimizerStatic8bitBlockwise(gtype, optim_name) \
 template void optimizerStatic8bitBlockwise<gtype, optim_name>(gtype* p, gtype* g, \
-                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,  \
+                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float beta3, float alpha, float eps, int step, float lr,  \
                 float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n); \
 
 MAKE_optimizerStatic8bitBlockwise(half, ADAM);
 MAKE_optimizerStatic8bitBlockwise(float, ADAM);
+MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAM);
 MAKE_optimizerStatic8bitBlockwise(half, MOMENTUM);
 MAKE_optimizerStatic8bitBlockwise(float, MOMENTUM);
+MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, MOMENTUM);
 MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
 MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
+MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, RMSPROP);
 MAKE_optimizerStatic8bitBlockwise(half, LION);
 MAKE_optimizerStatic8bitBlockwise(float, LION);
 MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, LION);
 MAKE_optimizerStatic8bitBlockwise(half, ADAGRAD);
 MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD);
+MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAGRAD);
+MAKE_optimizerStatic8bitBlockwise(half, ADEMAMIX);
+MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADEMAMIX);
+MAKE_optimizerStatic8bitBlockwise(float, ADEMAMIX);
 
 template void percentileClipping(float * g, float *gnorm_vec, int step, const int n);
 template void percentileClipping(half * g, float *gnorm_vec, int step, const int n);
-
-MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAM);
diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh
index e57cbb3b5..bc7b61e08 100644
--- a/csrc/ops_hip.cuh
+++ b/csrc/ops_hip.cuh
@@ -73,6 +73,7 @@ typedef enum Optimizer_t
   LARS = 3,
   ADAGRAD = 4,
   LION = 5,
+  ADEMAMIX = 6,
 } Optimizer_t;
 
 typedef enum Transform_t
@@ -143,13 +144,13 @@ class ContextHipsparse
 template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n);
 
 void quantize(float *code, float *A, unsigned char *out, int n);
-void dequantize(float *code, unsigned char *A, float *out, int n);
+void dequantize(float *code, unsigned char *A, float *out, int n, hipStream_t stream);
 template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
-template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n);
+template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n, hipStream_t stream);
 
 template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
                 float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
-                float beta1, float beta2, float eps, float weight_decay,
+                float beta1, float beta2, float beta3, float alpha, float eps, float weight_decay,
                 int step, float lr, const float gnorm_scale, bool skip_zeros, int n);
 
 template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g, unsigned char* state1, unsigned char* state2,
@@ -162,7 +163,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g, unsigne
                 const float gnorm_scale, int n);
 
 template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
-                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
+                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float beta3, float alpha, float eps, int step, float lr,
                 float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale,
 								bool skip_zeros, int n);
 
@@ -175,14 +176,13 @@ void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, i
                     long long int strideA, long long int strideB, long long int strideC, int batchCount);
 
 
-template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
 
 template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(hipblasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2);
 void cutlass_igemm(bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc);
-void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols);
-void getColRowStats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols);
-void doubleRowColQuant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed,
-                       int *rowidx, int *colidx, half *val, int *nnz_block_ptr, float threshold, int rows, int cols);
+void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, half* bias, int numRows, int numCols, hipStream_t stream);
+void getRowStats(half * A, float *rowStats, float threshold, int rows, int cols, hipStream_t stream);
+void int8VectorQuant(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols, hipStream_t stream);
 
 template <int FORMAT, int TRANSPOSE> void transformRowToFormat(char * A, char *out, int rows, int cols);
 
@@ -196,7 +196,7 @@ void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rows
 
 template <typename T> void gemm_host(int m, int n, int k, T * A,  T* B,  T * out,  int lda, int ldb, int ldc, int bits);
 template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize);
-template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize);
+template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int k, T * A,  unsigned char* B,  float *absmax, float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize, hipStream_t stream);
 
 template <typename T, int FUNC> void func(T *A, T *B, T value, long n);
 
diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
index c00d18ffd..4b5e89ad8 100644
--- a/csrc/pythonInterface.cpp
+++ b/csrc/pythonInterface.cpp
@@ -17,6 +17,13 @@
 #endif
 #include <cpu_ops.h>
 
+// Compatibility between HIP/CUDA APIs
+#if BUILD_HIP
+#define cudaStream_t        hipStream_t
+#define __nv_bfloat16       hip_bfloat16
+#define cublasLtHandle_t    hipblasLtHandle_t
+#endif
+
 // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
 // We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but its better than to
 // maintain all that boilerplate
@@ -241,7 +248,6 @@ void transform_row2ampereT(char * A, char *out, int rows, int cols){ transformRo
 void extractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_TURING>(A, idx, out, idx_size, rows, cols); }
 void extractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_AMPERE>(A, idx, out, idx_size, rows, cols); }
 
-#if defined(BUILD_CUDA)
 int igemmlt_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
     return igemmlt<32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
 }
@@ -251,27 +257,6 @@ int igemmlt_8(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, c
 int igemmlt_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
     return igemmlt<8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
 }
-#endif
-
-#if BUILD_HIP
- int igemmlt_turing_32(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_TURING, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_turing_8(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_TURING, 8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_turing_8_rowscale(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_TURING, 8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_ampere_32(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_AMPERE, 32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_ampere_8(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_AMPERE, 8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
- int igemmlt_ampere_8_rowscale(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt<COL_AMPERE, 8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-#endif
 
 void spmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
 { spmm_coo_very_sparse_naive<half, 16>(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }
@@ -437,56 +422,23 @@ extern "C"
 	ContextHipsparse *get_hipsparse(){ return new ContextHipsparse(); }
 #endif
 
-#if BUILD_CUDA
-	int cigemmlt_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
-		return igemmlt_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
-	}
-	int cigemmlt_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
-		return igemmlt_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
-	}
-	int cigemmlt_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc,  cudaStream_t stream) {
-		return igemmlt_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
-	}
-
-  #define MAKE_FUNC_CTRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
+int cigemmlt_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
+	return igemmlt_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
+}
+int cigemmlt_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
+	return igemmlt_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
+}
+int cigemmlt_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc,  cudaStream_t stream) {
+	return igemmlt_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
+}
+#define MAKE_FUNC_CTRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
 	void ctransform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(Context *context, dtype *A, dtype *out, int dim1, int dim2) \
 	{ \
 		transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((cublasLtHandle_t) context->m_handle, A, out, dim1, dim2); \
 	} \
 
-#endif
-
-#if BUILD_HIP
-	int cigemmlt_turing_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_turing_32((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-	//{ (hipblasLtHandle_t)context->m_handle; return 0; }
-	//{ return 0; }//igemmlt_turing_32((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_turing_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_turing_8((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_turing_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_turing_8_rowscale((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_ampere_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_ampere_32((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_ampere_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_ampere_8_rowscale((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-	int cigemmlt_ampere_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
-	{ return igemmlt_ampere_8((hipblasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); }
-
-  #define MAKE_FUNC_CTRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
-	void ctransform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(Context *context, dtype *A, dtype *out, int dim1, int dim2) \
-	{ \
-		transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((hipblasLtHandle_t) context->m_handle, A, out, dim1, dim2); \
-	} \
-
-
-#endif
-
 
+	#if defined(BUILD_CUDA)
 	MAKE_FUNC_CTRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8)
@@ -495,8 +447,7 @@ extern "C"
 	MAKE_FUNC_CTRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8)
 	MAKE_FUNC_CTRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32)
-
-	#if defined(BUILD_HIP)
+	#elifif defined(BUILD_HIP)
 	MAKE_FUNC_CTRANSFORM(8, row, col, t, int8_t, ROW, COL, true, 8)
 	MAKE_FUNC_CTRANSFORM(32, row, col, n, int32_t, ROW, COL, false, 32)
 	MAKE_FUNC_CTRANSFORM(32, row, col, t, int32_t, ROW, COL, true, 32)

From d3ead1eb685599bbc2f619bf6df3242ae1602ebc Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 10 Feb 2025 14:17:06 -0500
Subject: [PATCH 213/234] Fix

---
 csrc/pythonInterface.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
index 4b5e89ad8..489151a87 100644
--- a/csrc/pythonInterface.cpp
+++ b/csrc/pythonInterface.cpp
@@ -447,7 +447,7 @@ int cigemmlt_8_rowscale(Context *context, int m, int n, int k, const int8_t *A,
 	MAKE_FUNC_CTRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8)
 	MAKE_FUNC_CTRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8)
 	MAKE_FUNC_CTRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32)
-	#elifif defined(BUILD_HIP)
+	#elif defined(BUILD_HIP)
 	MAKE_FUNC_CTRANSFORM(8, row, col, t, int8_t, ROW, COL, true, 8)
 	MAKE_FUNC_CTRANSFORM(32, row, col, n, int32_t, ROW, COL, false, 32)
 	MAKE_FUNC_CTRANSFORM(32, row, col, t, int32_t, ROW, COL, true, 32)

From 6c4d8789d7cfdafacd6b648e1cec98e1573c7433 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 10 Feb 2025 15:10:50 -0500
Subject: [PATCH 214/234] Build: use setuptools_scm for dynamic versioning
 compatibility with pyproject.toml

---
 .github/workflows/python-package.yml |  3 +-
 pyproject.toml                       |  8 +++--
 setup.py                             | 46 ++--------------------------
 3 files changed, 9 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 2c460db5f..ecaaad7ec 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -192,8 +192,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: pip
       - run: pip install build wheel
-      # for now need to do the below instead of prior `python -m build .`, which didn't allow us to access git tags
-      - run: python -m build --sdist && python -m build --wheel
+      - run: python -m build .
       - name: Determine and Set Platform Tag, then Tag Wheel
         shell: bash
         run: |
diff --git a/pyproject.toml b/pyproject.toml
index 515d90385..93d7b1829 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools >= 63.0.0"]
+requires = ["setuptools >= 63.0.0", "setuptools_scm >= 8.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -78,7 +78,11 @@ package-data = { "*" = ["libbitsandbytes*.*"] }
 include = ["bitsandbytes*"]
 
 [tool.setuptools.dynamic]
-version = {attr = "bitsandbytes.__version__"}
+#version = {attr = "bitsandbytes.__version__"}
+
+[tool.setuptools_scm]
+local_scheme = "no-local-version"
+version_file = "bitsandbytes/_version.py"
 
 [tool.pytest.ini_options]
 addopts = "-rP"
diff --git a/setup.py b/setup.py
index d76ee1866..d76c0e3f8 100644
--- a/setup.py
+++ b/setup.py
@@ -1,53 +1,11 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-import glob
-import os
-import subprocess
-
 from setuptools import find_packages, setup
 from setuptools.dist import Distribution
 
-libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.*"))
-libs = [os.path.basename(p) for p in libs]
-print("libs:", libs)
-
-
-def get_git_commit_hash():
-    return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("utf-8").strip()
-
-
-def is_git_tagged_commit():
-    tags = subprocess.check_output(["git", "tag", "--points-at", "HEAD"]).decode("utf-8").strip()
-    return bool(tags)
-
-
-def get_latest_semver_tag():
-    tags = subprocess.check_output(["git", "tag"], text=True).splitlines()
-    semver_tags = [tag for tag in tags if tag.count(".") == 2 and all(part.isdigit() for part in tag.split("."))]
-    if not semver_tags:
-        print("No valid semantic version tags found, use 1.0.0 defaultly")
-        semver_tags = ["1.0.0"]
-    return sorted(semver_tags, key=lambda s: list(map(int, s.split("."))))[-1]
-
-
-def write_version_file(version, filepath="bitsandbytes/_version.py"):
-    with open(filepath, "w") as f:
-        f.write(f'__version__ = "{version}"\n')
-
-
-def get_version_and_write_to_file():
-    latest_semver_tag = get_latest_semver_tag()
-    version = latest_semver_tag if is_git_tagged_commit() else f"{latest_semver_tag}.dev+{get_git_commit_hash()}"
-    write_version_file(version)
-    return version
-
 
-# Tested with wheel v0.29.0
+# Tested with wheel v0.45.1
 class BinaryDistribution(Distribution):
     def has_ext_modules(self):
         return True
 
 
-setup(version=get_version_and_write_to_file(), packages=find_packages(), distclass=BinaryDistribution)
+setup(packages=find_packages(), distclass=BinaryDistribution)

From 2d06869e97c7e0386d6a763e009059b653c76b27 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 10 Feb 2025 15:40:31 -0500
Subject: [PATCH 215/234] Update wheel build

---
 .github/workflows/python-package.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index ecaaad7ec..78472e2a9 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -168,13 +168,13 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
-        with:
-          fetch-depth: 1  # shallow clone
-      - name: Fetch tags for dynamic versioning in setup.py
-        run: |
-          git fetch --depth=1 origin --tags
-          echo "Available Git tags:"
-          git tag -n
+      #   with:
+      #     fetch-depth: 1  # shallow clone
+      # - name: Fetch tags for dynamic versioning in setup.py
+      #   run: |
+      #     git fetch --depth=1 origin --tags
+      #     echo "Available Git tags:"
+      #     git tag -n
       - name: Download build artifact
         uses: actions/download-artifact@v4
         with:
@@ -192,7 +192,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: pip
       - run: pip install build wheel
-      - run: python -m build .
+      - run: python -m build . -w
       - name: Determine and Set Platform Tag, then Tag Wheel
         shell: bash
         run: |

From 7c917b0f10d999cafb245da89ef723ef6ae5134a Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 10 Feb 2025 16:05:04 -0500
Subject: [PATCH 216/234] Add rocm6.3.2

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 78472e2a9..b0e6105c4 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -109,7 +109,7 @@ jobs:
         os: [ubuntu-latest]
         arch: [x86_64]
         rocm_version:
-          ["6.1.2", "6.2"]
+          ["6.1.2", "6.2.4", "6.3.2"]
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
       - uses: actions/checkout@v4

From fdbbfb6f423ba93a71cf17a1e6811d7543c6954a Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 10 Feb 2025 16:45:49 -0500
Subject: [PATCH 217/234] setuptools_scm update

---
 .github/workflows/python-package.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index b0e6105c4..0ccb10880 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -168,6 +168,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Needed for setuptools_scm.
       #   with:
       #     fetch-depth: 1  # shallow clone
       # - name: Fetch tags for dynamic versioning in setup.py

From 89373b8eaa68970adfdff41e28ec1c1310eb742c Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 12 Feb 2025 03:59:18 +0800
Subject: [PATCH 218/234] fix xpu woq linear dtype (#1506)

* fix xpu dtypoe

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix nf4 dtype

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu_xpu_common.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 5987f4f61..8c1f30f10 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -552,6 +552,8 @@ def gemm_4bit_impl(
         GEMM output tensor.
     """
     if getattr(state, "ipex", False):
+        # compute_dtype: 1 indicates fp16, 2 indicates bf16
+        compute_dtype = 2 if A.dtype == torch.bfloat16 else 1
         output = torch.ops.torch_ipex.woq_linear(
             A,
             B,
@@ -562,7 +564,7 @@ def gemm_4bit_impl(
             None,
             None,
             state.blocksize,
-            ipex_cpu.quantization.WoqLowpMode.BF16,
+            compute_dtype,
             1,
             state.compensation,
         )

From 26407538de13c306aecd88ea1d233eadb8dcb913 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 20 Feb 2025 23:20:53 +0800
Subject: [PATCH 219/234] fix version (#1532)

* fix version

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix setup version

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 setup.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index d76c0e3f8..2263f3bdd 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,8 @@
 from setuptools import find_packages, setup
 from setuptools.dist import Distribution
 
+VERSION = "1.0.0"
+
 
 # Tested with wheel v0.45.1
 class BinaryDistribution(Distribution):
@@ -8,4 +10,10 @@ def has_ext_modules(self):
         return True
 
 
-setup(packages=find_packages(), distclass=BinaryDistribution)
+def write_version_file(version, filepath="bitsandbytes/_version.py"):
+    with open(filepath, "w") as f:
+        f.write(f'__version__ = "{version}"\n')
+    return version
+
+
+setup(packages=find_packages(), distclass=BinaryDistribution, version=write_version_file(VERSION))

From c66e1370af4af5be3844ddd2e38d53c45d7913e0 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 5 Mar 2025 03:39:19 +0800
Subject: [PATCH 220/234] enable benchmark script (#1554)

* enable benchmark script

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* Small fixes to non_cuda_backends.mdx

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
---
 benchmarking/generation_benchmark.py | 67 ++++++++++++++++++++++++++++
 docs/source/non_cuda_backends.mdx    | 17 ++++---
 2 files changed, 79 insertions(+), 5 deletions(-)
 create mode 100755 benchmarking/generation_benchmark.py

diff --git a/benchmarking/generation_benchmark.py b/benchmarking/generation_benchmark.py
new file mode 100755
index 000000000..a03bf7e83
--- /dev/null
+++ b/benchmarking/generation_benchmark.py
@@ -0,0 +1,67 @@
+import argparse
+
+import torch
+import torch.utils.benchmark as benchmark
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--model_name", default="meta-llama/Llama-3.1-8B-Instruct", required=False, type=str, help="model_name"
+)
+parser.add_argument("--quant_type", default="int8", type=str, help="quant type", choices=["int8", "nf4", "fp4"])
+parser.add_argument("--device_map", default="cpu", type=str, help="device_map", choices=["cpu", "xpu", "cuda"])
+args = parser.parse_args()
+
+model_name = args.model_name
+device_map = args.device_map
+if args.quant_type == "int8":
+    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+else:
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type=args.quant_type,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_name, torch_dtype="auto", device_map=device_map, quantization_config=quantization_config
+)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)
+
+output = quantized_model.generate(**input_ids, max_new_tokens=10)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+
+
+# benchmark the performance
+def benchmark_fn(f, *args, **kwargs):
+    # Manual warmup
+    for _ in range(2):
+        f(*args, **kwargs)
+
+    t0 = benchmark.Timer(
+        stmt="f(*args, **kwargs)",
+        globals={"args": args, "kwargs": kwargs, "f": f},
+        num_threads=torch.get_num_threads(),
+    )
+    return t0.blocked_autorange().mean
+
+
+MAX_NEW_TOKENS = 100
+
+quantized_model_latency = benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)
+
+bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map, torch_dtype=torch.bfloat16)
+bf16_model_latency = benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)
+
+print(f"bnb model latency: {quantized_model_latency:.3f}")
+print(f"bf16 model latency: {bf16_model_latency:.3f}")
+print(f"BNB vs. bf16 model speed-up: {(bf16_model_latency / quantized_model_latency):.3f}")
+
+print(f"BNB model memory: {(quantized_model.get_memory_footprint() / 1024 / 1024 / 1024):.3f} GB")
+print(f"bf16 model memory: {(bf16_model.get_memory_footprint() / 1024 / 1024 / 1024):.3f} GB")
+print(
+    f"BNB vs. bf16 model memory ratio: {(bf16_model.get_memory_footprint() / quantized_model.get_memory_footprint()):.3f}"
+)
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index 4c429fb2d..fda78e589 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -27,18 +27,25 @@ Thank you for your support!
 
 ### Intel
 
-The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
+The below performance data is collected from the Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
+
+You may run `benchmarking/generation_benchmark.py` to reproduce the below model memory and inference results. Please note that you need to bind cores if you are using the CPU to benchmark. For example, run `numactl -C 0-55 -m 0 python generation_benchmark.py --quant_type nf4` on Intel 4th Gen Xeon with single socket.
+
+The finetune results are selected from [peft](https://github.com/huggingface/peft/blob/main/examples/olora_finetuning/olora_finetuning.py).
+
+#### Model memory (CPU)
+| Data Type | BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Memory (GB) | 15.0 | 8.5 | 5.2 | 5.2 |
 
 #### Inference (CPU)
 
 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.44x | 1.8x | 0.1x |
-| Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
+| Speed-Up (vs BF16) | 1.0x | 0.57x | 2.6x | 0.1x |
 
 #### Fine-Tuning (CPU)
 
 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.38x | 0.1x | 0.1x |
-| Memory (GB) | 40 | 9 | 6.6 | 6.6 |
+| Speed-Up (vs BF16) | 1.0x | 0.91x | 1.0x | 1.0x |

From 83c147dea43cbf41627f8690173ffe3e11dd91f3 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 13 Mar 2025 17:26:38 +0800
Subject: [PATCH 221/234] update comments (#1562)

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu_xpu_common.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 8c1f30f10..3f7255e8b 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -428,8 +428,7 @@ def dequantize_4bit_impl(
     quant_type="nf4",
 ) -> Tensor:
     """
-    Dequantizes FP4 blockwise quantized values.
-
+    Dequantizes 4-bit blockwise quantized values.
     Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
 
     Parameters
@@ -445,8 +444,7 @@ def dequantize_4bit_impl(
     blocksize : int
         The blocksize used in quantization.
     quant_type : str
-        The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now
-
+        The 4-bit quantization data type {fp4, nf4}
 
     Returns
     -------

From 0cd87aaf30219ffc2e5dc0716554d44fc2cba232 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 13 Mar 2025 17:27:09 +0800
Subject: [PATCH 222/234] enable quant storage (#1563)

* enable quant storage

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix to numpy

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu.py            |  3 +--
 bitsandbytes/backends/cpu_xpu_common.py | 14 +++++++++++++-
 bitsandbytes/backends/xpu.py            |  3 +--
 bitsandbytes/nn/modules.py              |  1 +
 4 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index aa7249309..3d99398fc 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -137,8 +137,7 @@ def quantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_cpu([A, absmax, out])
-        assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
-        return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type, quant_storage)
 
     def dequantize_4bit(
         self,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 3f7255e8b..d478c64cf 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -296,6 +296,7 @@ def quantize_4bit_impl(
     blocksize=64,
     compress_statistics=False,
     quant_type="nf4",
+    quant_storage=torch.uint8,
 ) -> Tensor:
     """
     Quantize tensor A in blocks of 4-bit values.
@@ -314,6 +315,8 @@ def quantize_4bit_impl(
         The blocksize used in quantization.
     quant_type : str
         The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now
+    quant_storage: torch.dtype
+        We can use bytes to convert storage type.
 
     Returns
     -------
@@ -401,6 +404,10 @@ def quantize_4bit_impl(
             quant_type=quant_type,
         )
 
+    if quant_storage != torch.uint8:
+        bytes_value = out.cpu().numpy().tobytes()
+        out = torch.frombuffer(bytes_value, dtype=quant_storage).to(A.device)
+
     return out.reshape(-1, 1), state
 
 
@@ -418,7 +425,8 @@ def dequant_8bit(A, offset, quant_state):
     return absmax
 
 
-@_maybe_torch_compile
+# Compile will fail in torch.frombuffer
+# @_maybe_torch_compile
 def dequantize_4bit_impl(
     A: Tensor,
     quant_state=None,
@@ -453,6 +461,10 @@ def dequantize_4bit_impl(
     """
     transpose = True if A.shape[0] == 1 else False
     A = A.reshape(-1)
+    device = A.device
+    if A.dtype != torch.uint8:
+        bytes_value = A.cpu().numpy().tobytes()
+        A = torch.frombuffer(bytes_value, dtype=torch.uint8).to(device)
 
     if quant_state is None:
         assert absmax is not None and out is not None
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index 142d98bb6..c1c20aa1e 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -138,8 +138,7 @@ def quantize_4bit(
         if blocksize is None:
             blocksize = 64
         assert_on_xpu([A, absmax, out])
-        assert quant_storage == torch.uint8, "XPU backend only supports uint8 quant_storage"
-        output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        output = quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type, quant_storage)
         return output
 
     def dequantize_4bit(
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index a48a58414..0434f0e81 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -498,6 +498,7 @@ def set_ipex_linear(self, x: torch.Tensor):
         if (
             (x.device.type in ("cpu", "xpu"))
             and not getattr(self.weight.quant_state, "ipex", False)
+            and self.weight.data.dtype == torch.uint8
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
             and not self.training

From 2354bdd086fe1364c7fe935b7fa375e2b8ae35fd Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 13 Mar 2025 17:28:27 +0800
Subject: [PATCH 223/234] fix meta device dispatch (#1564)

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/nn/modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 0434f0e81..0ea82575a 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -695,7 +695,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if device is not None:
+        if device in ("cuda", "xpu", "cpu"):
             if device.type == "cuda" and self.data.device.type == "cpu":
                 return self.cuda(device)
             elif device.type == "cpu":

From 249a3cd0a51675475b3a3f2e212cd92b4364c579 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 13 Mar 2025 17:43:16 +0800
Subject: [PATCH 224/234] Enable XPU int matmul (#1547)

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu_xpu_common.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index d478c64cf..87ffc7360 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -194,8 +194,10 @@ def int8_linear_matmul_impl(
 
     A_reshaped = A.reshape(m, k)
 
-    # torch._int_mm is available on CPU since torch 2.4
-    if _torch_version_prereq(2, 4) and A.device.type == "cpu":
+    # torch._int_mm is available on CPU since torch 2.4, XPU since torch 2.6
+    if (A.device.type == "cpu" and _torch_version_prereq(2, 4)) or (
+        A.device.type == "xpu" and _torch_version_prereq(2, 6)
+    ):
         C = torch._int_mm(A_reshaped, B.T).to(dtype)
     else:
         C = torch.matmul(A_reshaped.float(), B.t().float()).to(dtype)

From 8fe63259d21fff9387926aa86547414b67060536 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 18 Mar 2025 17:43:40 +0800
Subject: [PATCH 225/234] Fix XPU 4bit (#1567)

* fix 4bit XPU dequant 4bit

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix default value

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix ipex linear set

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix ipex linear set to false when calling state dict

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix Int8Param device patch

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/functional.py | 18 +++++++++---------
 bitsandbytes/nn/modules.py | 11 +++++------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index a76aadb73..2b4a1e246 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1067,7 +1067,7 @@ def dequantize_fp4(
     quant_state: Optional[QuantState] = None,
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-    blocksize: int = 64,
+    blocksize: Optional[int] = None,
 ) -> torch.Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
 
@@ -1077,7 +1077,7 @@ def dequantize_nf4(
     quant_state: Optional[QuantState] = None,
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-    blocksize: int = 64,
+    blocksize: Optional[int] = None,
 ) -> torch.Tensor:
     return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
 
@@ -1087,8 +1087,8 @@ def dequantize_4bit(
     quant_state: Optional[QuantState] = None,
     absmax: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-    blocksize: int = 64,
-    quant_type="fp4",
+    blocksize: Optional[int] = None,
+    quant_type: Optional[str] = "fp4",
 ) -> torch.Tensor:
     """Dequantizes a packed 4-bit quantized tensor.
 
@@ -1106,9 +1106,9 @@ def dequantize_4bit(
             Required if `quant_state` is not provided and ignored otherwise.
         out (`torch.Tensor`, *optional*): A tensor to use to store the result.
         blocksize (`int`, *optional*):
-            The size of the blocks. Defaults to 64.
+            The size of the blocks. Defaults to 64 if not HIP_ENVIRONMENT else 128.
             Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
-        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
+        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to "fp4".
 
     Raises:
         ValueError: Raised when the input data type or blocksize is not supported.
@@ -1118,9 +1118,9 @@ def dequantize_4bit(
     """
     ensure_backend_is_available(A.device.type)
     if quant_state is not None:
-        absmax = absmax or quant_state.absmax
-        quant_type = quant_type or quant_state.quant_type
-        blocksize = blocksize or quant_state.blocksize
+        absmax = quant_state.absmax
+        quant_type = quant_state.quant_type
+        blocksize = quant_state.blocksize
     if blocksize is None:
         # Some AMD GPUs have warpsize 64
         # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 0ea82575a..961f746ba 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -487,6 +487,7 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
                 self.weight.data = reverse_4bit_compress_format(self.weight.data.reshape(1, -1))
 
             self.weight.quant_state.ipex = False
+            self.ipex_linear_is_set = False
 
         super()._save_to_state_dict(destination, prefix, keep_vars)  # saving weight and bias
 
@@ -496,15 +497,13 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
 
     def set_ipex_linear(self, x: torch.Tensor):
         if (
-            (x.device.type in ("cpu", "xpu"))
-            and not getattr(self.weight.quant_state, "ipex", False)
+            not getattr(self.weight.quant_state, "ipex", False)
             and self.weight.data.dtype == torch.uint8
             and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
             and self.weight.quant_state.quant_type == "nf4"
-            and not self.training
-            and x.requires_grad == False
         ):
-            enable_ipex_fusion(self, x)
+            if x.device.type == "xpu" or (x.device.type == "cpu" and not self.training and x.requires_grad == False):
+                enable_ipex_fusion(self, x)
 
     def forward(self, x: torch.Tensor):
         # Check if ipex fusion can be used
@@ -695,7 +694,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if device in ("cuda", "xpu", "cpu"):
+        if device is not None and device.type in ("cuda", "xpu", "cpu"):
             if device.type == "cuda" and self.data.device.type == "cpu":
                 return self.cuda(device)
             elif device.type == "cpu":

From d3658c54819cb4c037edabc89864300c18200575 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 24 Mar 2025 18:57:43 +0800
Subject: [PATCH 226/234] Fix xpu to cpu (#1570)

* fix xpu to cpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix xpu cpu data device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/nn/modules.py | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 961f746ba..eb528576d 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -694,32 +694,30 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if device is not None and device.type in ("cuda", "xpu", "cpu"):
+        if device is not None:
             if device.type == "cuda" and self.data.device.type == "cpu":
                 return self.cuda(device)
             elif device.type == "cpu":
                 if self.data.dtype == torch.int8:
                     self.CB = self.data
-                    return self
                 else:
                     return self.cpu()
             elif device.type == "xpu":
                 if self.data.dtype == torch.int8:
-                    self.data = self.data.contiguous().xpu(device)
+                    self.data = self.data.contiguous()
                     self.CB = self.data
-                    return self
-                else:
+                if self.data.device.type == "cpu":
                     return self.xpu(device)
-        else:
-            new_param = Int8Params(
-                super().to(device=device, dtype=dtype, non_blocking=non_blocking),
-                requires_grad=self.requires_grad,
-                has_fp16_weights=self.has_fp16_weights,
-            )
-            new_param.CB = self.CB
-            new_param.SCB = self.SCB
 
-            return new_param
+        new_param = Int8Params(
+            super().to(device=device, dtype=dtype, non_blocking=non_blocking),
+            requires_grad=self.requires_grad,
+            has_fp16_weights=self.has_fp16_weights,
+        )
+        new_param.CB = self.CB
+        new_param.SCB = self.SCB
+
+        return new_param
 
 
 def maybe_rearrange_weight(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):

From d180d8e87b1cb19eccd2d73006e750ee3f5b3b1e Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 2 Apr 2025 22:23:04 +0800
Subject: [PATCH 227/234] fix double compress 8bit precision (#1582)

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu_xpu_common.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 87ffc7360..2a29604ba 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -369,8 +369,9 @@ def quantize_4bit_impl(
             out_uint8[abs_scaled_A > key] = val
         out_uint8 += sign.to(torch.uint8) * 8
     elif quant_type == "int8":
-        for i in range(len(INT8_QUANT_TABLE)):
-            out_uint8[scaled_A > INT8_QUANT_TABLE[i]] = i
+        map = torch.tensor(INT8_QUANT_TABLE, device=scaled_A.device)
+        diff = torch.abs(scaled_A.unsqueeze(-1) - map)
+        out_uint8 = torch.argmin(diff, dim=-1).to(torch.uint8).to(scaled_A.device)
 
     if quant_type == "int8":
         out = out_uint8

From 54a2ad57a7260befebf942b28f39e7fc2f8b555b Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 9 Apr 2025 02:30:21 +0800
Subject: [PATCH 228/234] Remove error log for Intel CPU/XPU (#1503)

* fix intel cpu/xpu warning

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix error log

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix lib

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* rm return Nonr

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* error log only without ipex

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix import eerror

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix format

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/cextension.py   | 35 ++++++++++++++++++++++-------------
 docs/source/installation.mdx |  4 ++--
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 52e56bf8e..e2d8295b1 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -93,6 +93,14 @@ def get_native_library() -> BNBNativeLibrary:
 
 ROCM_GPU_ARCH = get_rocm_gpu_arch()
 
+try:
+    import intel_extension_for_pytorch as ipex
+
+    assert ipex._C._has_cpu() or ipex._C._has_xpu()
+    is_ipex_available = True
+except Exception:
+    is_ipex_available = False
+
 try:
     if torch.version.hip:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
@@ -107,16 +115,17 @@ def get_native_library() -> BNBNativeLibrary:
     lib = get_native_library()
 except Exception as e:
     lib = None
-    logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True)
-    if torch.cuda.is_available():
-        logger.warning(
-            f"""
-{BNB_BACKEND} Setup failed despite {BNB_BACKEND} being available. Please run the following command to get more information:
-
-python -m bitsandbytes
-
-Inspect the output of the command and see if you can locate {BNB_BACKEND} libraries. You might need to add them
-to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
-and open an issue at: https://github.com/bitsandbytes-foundation/bitsandbytes/issues
-""",
-        )
+    if not is_ipex_available:
+        logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True)
+        if torch.cuda.is_available():
+            logger.warning(
+                f"""
+    {BNB_BACKEND} Setup failed despite {BNB_BACKEND} being available. Please run the following command to get more information:
+
+    python -m bitsandbytes
+
+    Inspect the output of the command and see if you can locate {BNB_BACKEND} libraries. You might need to add them
+    to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
+    and open an issue at: https://github.com/bitsandbytes-foundation/bitsandbytes/issues
+    """,
+            )
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 4f64f6385..17b2d37d5 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -341,10 +341,10 @@ pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise
 </hfoption>
 <hfoption id="Intel CPU + GPU">
 
-#### Intel CPU
+#### Intel CPU / XPU
 
 > [!TIP]
-> Intel CPU backend only supports building from source; for now, please follow the instructions below.
+> Intel CPU / XPU backend only supports building from source; for now, please follow the instructions below.
 
 Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems.
 

From 5c48b3337df946c357ec434f0d447857644ca1ce Mon Sep 17 00:00:00 2001
From: Liangliang Ma <1906710196@qq.com>
Date: Tue, 15 Apr 2025 23:13:08 +0800
Subject: [PATCH 229/234] XPU backend support 8bit optimizer (#1565)

* enable xpu 8bit optim

* add deqaunt_blockwise

* dequantize_blockwise

* add bakcend synchronize

* refine code

* ipex dep

* ipex dep too

* ipex version check

---------

Co-authored-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/backends/cpu.py            |  3 +
 bitsandbytes/backends/cpu_xpu_common.py |  2 +-
 bitsandbytes/backends/cuda.py           |  3 +
 bitsandbytes/backends/mps.py            |  3 +
 bitsandbytes/backends/npu.py            |  3 +
 bitsandbytes/backends/xpu.py            | 77 ++++++++++++++++++++++++-
 bitsandbytes/functional.py              | 11 +++-
 bitsandbytes/optim/optimizer.py         |  5 +-
 8 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index 3d99398fc..afe71c080 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -35,6 +35,9 @@ class CPUBackend(Backend):
     mm_dequant_compute_dtype = torch.bfloat16
     mm_dequant_output_dtype = torch.bfloat16
 
+    def device_synchronize(self):
+        pass
+
     def int8_double_quant(
         self,
         A: torch.Tensor,
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 2a29604ba..22e2563d9 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -60,7 +60,7 @@ def _ipex_xpu_version_prereq(major, minor):
 
 def _maybe_torch_compile(func):
     # torch.compile requires g++ and pytorch >= 2.0
-    if gxx_available and _torch_version_prereq(2, 0) and not ipex_xpu:
+    if gxx_available and _torch_version_prereq(2, 0) and ipex_cpu_only:
         options = {}
         # fx_graph_cache requires pytorch >= 2.2
         if _torch_version_prereq(2, 2):
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index f8c27255f..a3a610580 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -97,6 +97,9 @@
 
 
 class CUDABackend(Backend):
+    def device_synchronize(self):
+        torch.cuda.synchronize()
+
     def transform(
         self,
         A: torch.Tensor,
diff --git a/bitsandbytes/backends/mps.py b/bitsandbytes/backends/mps.py
index 5b7eda0c7..9400699a9 100644
--- a/bitsandbytes/backends/mps.py
+++ b/bitsandbytes/backends/mps.py
@@ -8,6 +8,9 @@
 
 
 class MPSBackend(Backend):
+    def device_synchronize(self):
+        torch.mps.synchronize()
+
     def double_quant(
         self,
         A: torch.Tensor,
diff --git a/bitsandbytes/backends/npu.py b/bitsandbytes/backends/npu.py
index d22fe04e8..cd3933879 100644
--- a/bitsandbytes/backends/npu.py
+++ b/bitsandbytes/backends/npu.py
@@ -29,6 +29,9 @@ def assert_on_npu(tensors):
 
 
 class NPUBackend(Backend):
+    def device_synchronize(self):
+        torch.npu.synchronize()
+
     def int8_double_quant(
         self,
         A: torch.Tensor,
diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py
index c1c20aa1e..702c3c386 100644
--- a/bitsandbytes/backends/xpu.py
+++ b/bitsandbytes/backends/xpu.py
@@ -12,11 +12,28 @@
     int8_linear_matmul_impl,
     int8_mm_dequant_impl,
     quantize_4bit_impl,
+    _ipex_xpu_version_prereq
 )
+try:
+    import intel_extension_for_pytorch as ipex
+    ipex_xpu = ipex if ipex._C._has_xpu() else None
+except BaseException:
+    ipex_xpu = None
 
 Tensor = torch.Tensor
 
 
+str2optimizer8bit_blockwise = {}
+if ipex_xpu is not None and _ipex_xpu_version_prereq(2, 7):
+    str2optimizer8bit_blockwise = {
+            "adam": (
+                ipex.xpu.bitsandbytes.cadam_8bit_blockwise_grad_fp32,
+                ipex.xpu.bitsandbytes.cadam_8bit_blockwise_grad_fp16,
+                ipex.xpu.bitsandbytes.cadam_8bit_blockwise_grad_bf16,
+            ),
+        }
+
+
 def assert_on_xpu(tensors):
     on_xpu = True
     for t in tensors:
@@ -35,6 +52,9 @@ class XPUBackend(Backend):
     mm_dequant_compute_dtype = torch.bfloat16
     mm_dequant_output_dtype = torch.bfloat16
 
+    def device_synchronize(self):
+        torch.xpu.synchronize()
+
     def int8_double_quant(
         self,
         A: torch.Tensor,
@@ -185,7 +205,19 @@ def dequantize_blockwise(
         blocksize: int = 4096,
         nested=False,
     ) -> torch.Tensor:
-        raise NotImplementedError
+        if ipex_xpu is None or not _ipex_xpu_version_prereq(2, 7):
+            raise RuntimeError("Please install intel_extension_for_ipex >= 2.7 for 8bit optimizer backend on XPU device.")
+
+        # void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n, cudaStream_t stream)
+        if out.dtype == torch.float16:
+            ipex.xpu.bitsandbytes.cdequantize_blockwise_fp16(code, A, absmax, out, blocksize, A.numel())
+        elif out.dtype == torch.bfloat16:
+            ipex.xpu.bitsandbytes.cdequantize_blockwise_bf16(code, A, absmax, out, blocksize, A.numel())
+        elif out.dtype == torch.float32:
+            ipex.xpu.bitsandbytes.cdequantize_blockwise_fp32(code, A, absmax, out, blocksize, A.numel())
+        else:
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {out.dtype}")
+        
 
     def quantize_blockwise(
         self,
@@ -220,7 +252,48 @@ def optimizer_update_8bit_blockwise(
         gnorm_scale: float = 1.0,
         skip_zeros=False,
     ) -> None:
-        raise NotImplementedError
+        optim_func = None
+        if ipex_xpu is None or not _ipex_xpu_version_prereq(2, 7):
+            raise RuntimeError("Please install intel_extension_for_ipex >= 2.7 for 8bit optimizer backend on XPU device.")
+
+        assert_on_xpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2])
+
+        if g.dtype == torch.float32 and state1.dtype == torch.uint8:
+            optim_func = str2optimizer8bit_blockwise[optimizer_name][0]
+        elif g.dtype == torch.float16 and state1.dtype == torch.uint8:
+            optim_func = str2optimizer8bit_blockwise[optimizer_name][1]
+        elif (
+            g.dtype == torch.bfloat16
+            and state1.dtype == torch.uint8
+            and len(str2optimizer8bit_blockwise[optimizer_name]) == 3
+        ):
+            optim_func = str2optimizer8bit_blockwise[optimizer_name][2]
+        else:
+            raise ValueError(
+                f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}",
+            )
+        optim_func(
+            p,
+            g,
+            state1,
+            state2,
+            beta1,
+            beta2,
+            beta3,
+            alpha,
+            eps,
+            step,
+            lr,
+            qmap1,
+            qmap2,
+            absmax1,
+            absmax2,
+            weight_decay,
+            gnorm_scale,
+            skip_zeros,
+            g.numel()
+        )
+
 
     def optimizer_update_32bit(
         self,
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 2b4a1e246..d1b3dd581 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -859,7 +859,16 @@ def dequantize_blockwise(
     if out is None:
         out = torch.empty(A.shape, dtype=quant_state.dtype, device=A.device)
 
-    if A.device.type != "cpu":
+    if A.device.type == "xpu":
+        backends[A.device.type].dequantize_blockwise(
+            A=A,
+            quant_state=quant_state,
+            absmax=absmax,
+            code=quant_state.code,
+            out=out,
+            blocksize=blocksize,
+            nested=quant_state.nested,)
+    elif A.device.type != "cpu":
         code = quant_state.code.to(A.device)
         supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
         # Some AMD GPUs have warpsize 64
diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index 03e0e01d7..0a78b4ade 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -10,6 +10,7 @@
 import torch
 
 import bitsandbytes.functional as F
+from bitsandbytes.backends import backends
 
 
 class MockArgs:
@@ -289,11 +290,11 @@ def step(self, closure=None):
 
                 self.prefetch_state(p)
                 self.update_step(group, p, gindex, pindex)
-                torch.cuda.synchronize()
+                backends[p.device.type].device_synchronize()
         if self.is_paged:
             # all paged operation are asynchronous, we need
             # to sync to make sure all tensors are in the right state
-            torch.cuda.synchronize()
+            backends[p.device.type].device_synchronize()
 
         return loss
 

From b090d85a335ebfd838daabd8794d0fa396531d79 Mon Sep 17 00:00:00 2001
From: Chetan Kumar Verma <39086835+ckvermaAI@users.noreply.github.com>
Date: Tue, 15 Apr 2025 20:56:58 +0530
Subject: [PATCH 230/234] HPU support for bitsandbytes (#1592)

Authored by: Chetan Kumar Verma <chetan.kumar.verma@intel.com>
Co-authored-by: Ruheena Suhani Shaik <ruheena.suhani.shaik@intel.com>
Co-authored-by: Bhargav Eede <bhargav.eede@intel.com>
Co-authored-by: Vivek Goel <vivek.goel@intel.com>

Co-authored-by: Ruheena Suhani Shaik <rsshaik@habana.ai>
---
 bitsandbytes/__init__.py     |   7 +
 bitsandbytes/backends/hpu.py | 315 +++++++++++++++++++++++++++++++++++
 bitsandbytes/nn/modules.py   |   2 +-
 3 files changed, 323 insertions(+), 1 deletion(-)
 create mode 100644 bitsandbytes/backends/hpu.py

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index f850140a1..59f881cc9 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -28,11 +28,18 @@
     "npu",  # Ascend NPU
     "xpu",  # Intel GPU
     "cpu",
+    "hpu",  # Intel Gaudi
 }
 
 # Always register the CPU backend.
 register_backend("cpu", CPUBackend())
 
+# Register HPU Backend, if available
+if hasattr(torch, "hpu") and torch.hpu.is_available():
+    from .backends.hpu import HPUBackend
+
+    register_backend("hpu", HPUBackend())
+
 # Register either CUDA or ROCm backend, if available.
 # Only one of these backends can be used at a time, since the torch.device semantics are
 # the same for both torch+rocm and torch+cuda (e.g. device name is "cuda")
diff --git a/bitsandbytes/backends/hpu.py b/bitsandbytes/backends/hpu.py
new file mode 100644
index 000000000..03308cd5d
--- /dev/null
+++ b/bitsandbytes/backends/hpu.py
@@ -0,0 +1,315 @@
+import math
+from typing import Literal, Optional, Tuple
+import warnings
+import torch
+
+from bitsandbytes.utils import QuantState
+
+from .base import Backend
+from .cpu_xpu_common import (
+    double_quant_impl,
+    dequant_8bit,
+    NF4_QUANT_TABLE,
+    INT8_QUANT_TABLE,
+)
+from bitsandbytes.functional import (
+    QuantState,
+    get_4bit_type,
+)
+
+Tensor = torch.Tensor
+
+def assert_on_hpu(tensors):
+    on_hpu = True
+    for t in tensors:
+        if t is None:
+            continue  # NULL pointers are fine
+        on_hpu &= t.device.type == "hpu"
+    if not on_hpu:
+        raise TypeError(
+            "All input tensors need to be on HPU, but found some tensors to not be on HPU:\n"
+            f" {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}"
+        )
+    return on_hpu
+
+class HPUBackend(Backend):
+
+    def int8_double_quant(
+        self,
+        A: torch.Tensor,
+        col_stats: Optional[torch.Tensor] = None,
+        row_stats: Optional[torch.Tensor] = None,
+        out_col: Optional[torch.Tensor] = None,
+        out_row: Optional[torch.Tensor] = None,
+        threshold=0.0,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        assert_on_hpu([A, col_stats, row_stats,  out_col, out_row])
+        return double_quant_impl(A, col_stats, row_stats,  out_col, out_row, threshold)
+
+    def transform(
+        self,
+        A: torch.Tensor,
+        to_order: str,
+        from_order="row",
+        out: Optional[torch.Tensor] = None,
+        transpose=False,
+        state: Optional[Tuple[torch.Size, str]] = None,
+        ld=None,
+    ):
+        raise NotImplementedError("Not yet implemented for HPU backend")
+
+    def int8_linear_matmul(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        dtype=torch.int32,
+    ) -> torch.Tensor:
+        raise NotImplementedError("Not yet implemented for HPU backend")
+
+    def int8_mm_dequant(
+        self,
+        A: torch.Tensor,
+        row_stats: torch.Tensor,
+        col_stats: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError("Not yet implemented for HPU backend")
+
+    def extract_outliers(
+        self,
+        A: torch.Tensor,
+        SA: Tuple[torch.Size, str],
+        idx: torch.Tensor,
+    ) -> torch.Tensor:
+        raise NotImplementedError("Not yet implemented for HPU backend")
+
+    def quantize_4bit(
+        self,
+        A: torch.Tensor,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type: Literal["nf4"] = "nf4",
+        quant_storage=torch.uint8,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        if blocksize is None:
+            blocksize = 64
+        assert_on_hpu([A, absmax, out])
+        assert quant_storage == torch.uint8, "HPU backend only supports uint8 quant_storage"
+        return self.quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
+        
+    def quantize_4bit_impl(
+        self,
+        A: Tensor,
+        absmax: Tensor = None,
+        out: Tensor = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type="nf4",
+    ) -> Tensor:
+        if quant_type not in ["nf4", "int8"]:
+            raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented for HPU.")
+        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        n = A.numel()
+        input_shape = A.shape
+        blocks = n // blocksize
+        blocks += 1 if n % blocksize > 0 else 0
+
+        if absmax is None:
+            absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype)
+
+        if out is None:
+            out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device)
+
+        rem = n % blocksize
+        has_rem = rem > 0
+
+        # Scale tensor to [-1, 1]
+        A_reshaped = A.reshape(n)
+        A_com = A_reshaped[: n - rem]
+        A_com_reshaped = A_com.reshape(n // blocksize, blocksize)
+        absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0]
+        scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[: blocks - has_rem].view(-1, 1)), -1, 1)
+        scaled_A = scaled_A.reshape(-1)
+        if has_rem:
+            absmax[-1] = torch.abs(A_reshaped[n - rem :]).max()
+            scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1)
+            scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
+        # map [-1, 1] to nf4
+        out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8, device=A.device)
+        if quant_type == "nf4":
+            for i in range(len(NF4_QUANT_TABLE)):
+                out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i
+        elif quant_type == "int8":
+            map = torch.tensor(INT8_QUANT_TABLE, device=scaled_A.device)
+            diff = torch.abs(scaled_A.unsqueeze(-1) - map)
+            out_uint8 = torch.argmin(diff, dim=-1).to(torch.uint8).to(scaled_A.device)
+
+        if quant_type == "int8":
+            out = out_uint8
+            code = torch.Tensor(INT8_QUANT_TABLE).to(A.device)
+        else:
+            if out_uint8.size(-1) % 2:
+                out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0)
+            # To align with HPU dequantize operator
+            out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2])
+            code = get_4bit_type(quant_type, device=A.device)
+
+        if compress_statistics:
+            raise AssertionError("Double quantization is not supported for HPU backend")
+            offset = absmax.mean()
+            absmax -= offset
+            qabsmax, state2 = self.hpu_quantize_4bit_impl(absmax, blocksize=256, quant_type="int8")
+            del absmax
+            state = QuantState(
+                absmax=qabsmax,
+                shape=input_shape,
+                dtype=A.dtype,
+                blocksize=blocksize,
+                code=code,
+                quant_type=quant_type,
+                offset=offset,
+                state2=state2,
+            )
+        else:
+            state = QuantState(
+                absmax=absmax,
+                shape=input_shape,
+                dtype=A.dtype,
+                blocksize=blocksize,
+                code=code,
+                quant_type=quant_type,
+            )
+        return out, state
+
+    def dequantize_nf4_impl(
+        self,
+        input: torch.Tensor,
+        absmax: torch.Tensor,
+        blocksize: int,
+        quant_state: QuantState,
+    ) -> torch.Tensor:
+        """
+        HPU dequantization function for NF4 quantized tensors.
+        """
+        assert_on_hpu([input, absmax])
+        out_shape = (math.prod(quant_state.shape), )
+        out_dq = torch.ops.hpu.dequantize_nf4(input, absmax, blocksize,
+                                                out_shape=out_shape,
+                                                out_dtype=quant_state.dtype)
+        output = out_dq.reshape(quant_state.shape).T
+        return output
+
+    def dequantize_4bit(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 64,
+        quant_type: Literal["nf4"] = "nf4",
+    ) -> torch.Tensor:
+        if blocksize is None:
+            blocksize = 64
+            
+        assert_on_hpu([A, absmax, out])
+        if quant_state.nested:
+            raise AssertionError("Double quantization is not supported for HPU backend")
+            absmax = dequant_8bit(absmax, quant_state.offset, quant_state.state2)
+        return self.dequantize_nf4_impl(A, absmax, blocksize, quant_state)
+
+    def gemv_4bit(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+        transposed_A=False,
+        transposed_B=False,
+        state: QuantState = None,
+    ) -> torch.Tensor:
+        assert_on_hpu([A, B, out])
+        if state is None:
+            raise ValueError(
+                "state cannot be None. gemv_4bit() requires the state from quantize_4bit()"
+            )
+        dqB = self.dequantize_nf4_impl(B, state.absmax, state.blocksize, state)
+        output = torch.matmul(A, dqB.to(A.dtype))
+        if out is not None:
+            out.copy_(output)
+        else:
+            out = output
+        return out
+
+    def int8_vectorwise_dequant(self, A: torch.Tensor, stats: torch.Tensor):
+        raise NotImplementedError("Not yet implemented for HPU backend")
+
+    def int8_vectorwise_quant(self, A: torch.Tensor, threshold=0.0):
+        raise NotImplementedError("Not yet implemented for HPU backend")
+
+    def dequantize_blockwise(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        code: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 4096,
+        nested=False,
+    ) -> torch.Tensor:
+        raise NotImplementedError("Not yet implemented for HPU backend")
+
+    def quantize_blockwise(
+        self,
+        A: torch.Tensor,
+        code: Optional[torch.Tensor] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=4096,
+        nested=False,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        raise NotImplementedError("Not yet implemented for HPU backend")
+
+    def optimizer_update_8bit_blockwise(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        state2: Optional[torch.Tensor],
+        beta1: float,
+        beta2: float,
+        eps: float,
+        step: int,
+        lr: float,
+        qmap1: torch.Tensor,
+        qmap2: Optional[torch.Tensor],
+        absmax1: torch.Tensor,
+        absmax2: Optional[torch.Tensor],
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError("Not yet implemented for HPU backend")
+
+    def optimizer_update_32bit(
+        self,
+        optimizer_name: str,
+        g: torch.Tensor,
+        p: torch.Tensor,
+        state1: torch.Tensor,
+        beta1: float,
+        eps: float,
+        step: int,
+        lr: float,
+        state2: Optional[torch.Tensor] = None,
+        beta2: float = 0.0,
+        weight_decay: float = 0.0,
+        gnorm_scale: float = 1.0,
+        unorm_vec: Optional[torch.Tensor] = None,
+        max_unorm: float = 0.0,
+        skip_zeros=False,
+    ) -> None:
+        raise NotImplementedError("Not yet implemented for HPU backend")
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index eb528576d..cdeaebc27 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -345,7 +345,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ...
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
 
-        if device is not None and device.type in ["cuda", "cpu", "npu", "xpu"] and not self.bnb_quantized:
+        if device is not None and device.type in ["cuda", "cpu", "npu", "xpu", "hpu"] and not self.bnb_quantized:
             return self._quantize(device)
         else:
             if self.quant_state is not None:

From 5027e64a4f374e4099f5c102a1072c821188f819 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 21 Apr 2025 16:34:01 +0800
Subject: [PATCH 231/234] fix log (#1604)

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/cextension.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index e2d8295b1..007bdbf8e 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -116,7 +116,10 @@ def get_native_library() -> BNBNativeLibrary:
 except Exception as e:
     lib = None
     if not is_ipex_available:
-        logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True)
+        logger.error(
+            f"Could not load bitsandbytes native library: {e}. If you use Intel CPU or XPU, please pip install intel_extension_for_pytorch",
+            exc_info=True,
+        )
         if torch.cuda.is_available():
             logger.warning(
                 f"""

From 263179a0f8c7b07abc207e034047e07f8e9eaf4f Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 29 Apr 2025 15:31:29 +0800
Subject: [PATCH 232/234] fix xpu ipex linear in torch2.7 (#1618)

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 bitsandbytes/utils.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index e3748685e..7d56c4ac3 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -240,10 +240,16 @@ def enable_ipex_fusion(linear, x):
         )
     elif x.device.type == "xpu" and ipex_xpu and _ipex_xpu_version_prereq(2, 5):
         converted_weight = reverse_4bit_compress_format(linear.weight.data)
-        new_weight = converted_weight.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
         new_scales = quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
         new_zeros = None
         compensation = None
+        new_weight = converted_weight.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
+        # ipex 2.7 requires new_scales is a list of tensors
+        if _ipex_xpu_version_prereq(2, 7):
+            new_scales = list(new_scales)
+            # ipex 2.7 can dequant converted_weight directly.
+            if linear.training or x.requires_grad == False:
+                new_weight = converted_weight
     else:
         raise ValueError(
             "Please check the device and ipex version. The device should be cpu or xpu while ipex version should >= 2.5"

From 5e267f5fde01056874309c4d4a08d84292c60c1a Mon Sep 17 00:00:00 2001
From: Chetan Kumar Verma <39086835+ckvermaAI@users.noreply.github.com>
Date: Tue, 6 May 2025 00:56:05 +0530
Subject: [PATCH 233/234] update compute_type_is_set attr (#1623)

---
 bitsandbytes/nn/modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index cdeaebc27..f28ef651f 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -447,7 +447,7 @@ def __init__(
         )
         # self.persistent_buffers = []  # TODO consider as way to save quant state
         self.compute_dtype = compute_dtype
-        self.compute_type_is_set = False
+        self.compute_type_is_set = False if compute_dtype is None else True
         self.ipex_linear_is_set = False
         self.quant_state = None
         self.quant_storage = quant_storage

From c3eac426ef3f3473fc519b6169cc7e9172687aad Mon Sep 17 00:00:00 2001
From: Ruheena Suhani Shaik <rsshaik@habana.ai>
Date: Tue, 20 May 2025 19:20:43 +0530
Subject: [PATCH 234/234] supports HPU double quant (#1630)

---
 bitsandbytes/autograd/_functions.py |  2 +-
 bitsandbytes/backends/hpu.py        | 47 +++++++++--------------------
 2 files changed, 16 insertions(+), 33 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index d224cfe1c..b14d2024c 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -570,7 +570,7 @@ def matmul_4bit(
             return out
         else:
             return MatMul4Bit.apply(A, B, out, bias, quant_state)
-    elif A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "npu":
+    elif A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type not in ("npu", "hpu"):
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(
                 f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
diff --git a/bitsandbytes/backends/hpu.py b/bitsandbytes/backends/hpu.py
index 03308cd5d..2bc367078 100644
--- a/bitsandbytes/backends/hpu.py
+++ b/bitsandbytes/backends/hpu.py
@@ -1,24 +1,21 @@
 import math
 from typing import Literal, Optional, Tuple
-import warnings
+
 import torch
 
+from bitsandbytes.functional import get_4bit_type
 from bitsandbytes.utils import QuantState
 
 from .base import Backend
 from .cpu_xpu_common import (
-    double_quant_impl,
-    dequant_8bit,
-    NF4_QUANT_TABLE,
     INT8_QUANT_TABLE,
-)
-from bitsandbytes.functional import (
-    QuantState,
-    get_4bit_type,
+    NF4_QUANT_TABLE,
+    dequant_8bit,
 )
 
 Tensor = torch.Tensor
 
+
 def assert_on_hpu(tensors):
     on_hpu = True
     for t in tensors:
@@ -32,8 +29,8 @@ def assert_on_hpu(tensors):
         )
     return on_hpu
 
-class HPUBackend(Backend):
 
+class HPUBackend(Backend):
     def int8_double_quant(
         self,
         A: torch.Tensor,
@@ -43,8 +40,7 @@ def int8_double_quant(
         out_row: Optional[torch.Tensor] = None,
         threshold=0.0,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-        assert_on_hpu([A, col_stats, row_stats,  out_col, out_row])
-        return double_quant_impl(A, col_stats, row_stats,  out_col, out_row, threshold)
+        raise NotImplementedError("Not yet implemented for HPU backend")
 
     def transform(
         self,
@@ -100,7 +96,7 @@ def quantize_4bit(
         assert_on_hpu([A, absmax, out])
         assert quant_storage == torch.uint8, "HPU backend only supports uint8 quant_storage"
         return self.quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
-        
+
     def quantize_4bit_impl(
         self,
         A: Tensor,
@@ -159,10 +155,9 @@ def quantize_4bit_impl(
             code = get_4bit_type(quant_type, device=A.device)
 
         if compress_statistics:
-            raise AssertionError("Double quantization is not supported for HPU backend")
             offset = absmax.mean()
             absmax -= offset
-            qabsmax, state2 = self.hpu_quantize_4bit_impl(absmax, blocksize=256, quant_type="int8")
+            qabsmax, state2 = self.quantize_4bit_impl(absmax, blocksize=256, quant_type="int8")
             del absmax
             state = QuantState(
                 absmax=qabsmax,
@@ -196,10 +191,10 @@ def dequantize_nf4_impl(
         HPU dequantization function for NF4 quantized tensors.
         """
         assert_on_hpu([input, absmax])
-        out_shape = (math.prod(quant_state.shape), )
-        out_dq = torch.ops.hpu.dequantize_nf4(input, absmax, blocksize,
-                                                out_shape=out_shape,
-                                                out_dtype=quant_state.dtype)
+        out_shape = (math.prod(quant_state.shape),)
+        out_dq = torch.ops.hpu.dequantize_nf4(
+            input, absmax, blocksize, out_shape=out_shape, out_dtype=quant_state.dtype
+        )
         output = out_dq.reshape(quant_state.shape).T
         return output
 
@@ -214,10 +209,9 @@ def dequantize_4bit(
     ) -> torch.Tensor:
         if blocksize is None:
             blocksize = 64
-            
+
         assert_on_hpu([A, absmax, out])
         if quant_state.nested:
-            raise AssertionError("Double quantization is not supported for HPU backend")
             absmax = dequant_8bit(absmax, quant_state.offset, quant_state.state2)
         return self.dequantize_nf4_impl(A, absmax, blocksize, quant_state)
 
@@ -230,18 +224,7 @@ def gemv_4bit(
         transposed_B=False,
         state: QuantState = None,
     ) -> torch.Tensor:
-        assert_on_hpu([A, B, out])
-        if state is None:
-            raise ValueError(
-                "state cannot be None. gemv_4bit() requires the state from quantize_4bit()"
-            )
-        dqB = self.dequantize_nf4_impl(B, state.absmax, state.blocksize, state)
-        output = torch.matmul(A, dqB.to(A.dtype))
-        if out is not None:
-            out.copy_(output)
-        else:
-            out = output
-        return out
+        raise NotImplementedError("Not yet implemented for HPU backend")
 
     def int8_vectorwise_dequant(self, A: torch.Tensor, stats: torch.Tensor):
         raise NotImplementedError("Not yet implemented for HPU backend")