Add block_size attribute for nf4 operator (this change also comments out the kQuantizeBlockwiseNF4 kernel launch in LaunchQuantizeNF4)

wanghaoshuang · wanghaoshuang · commit 7cac65dec8ca · 2023-11-02T16:28:01.000+08:00
diff --git a/csrc/lc/nf4.cu b/csrc/lc/nf4.cu
@@ -185,23 +185,23 @@ std::vector<paddle::Tensor>  LaunchQuantizeNF4(const paddle::Tensor& input, int
     
     auto abs_max = paddle::full({num_blocks}, 1, paddle::DataType::FLOAT32, input.place());
 
-    const DataType_ *in_ptr = reinterpret_cast<const DataType_*>(input.data<data_t>());
-    unsigned char *out_ptr = output.mutable_data<unsigned char>();
-    float *abs_max_ptr = abs_max.mutable_data<float>();
-
-    if(block_size == 2048) {
-      kQuantizeBlockwiseNF4<DataType_, 2048, 4><<<num_blocks, 512>>>(in_ptr, abs_max_ptr, out_ptr, n);
-    } else if(block_size == 1024) {
-      kQuantizeBlockwiseNF4<DataType_, 1024, 4><<<num_blocks, 256>>>(in_ptr, abs_max_ptr, out_ptr, n);
-    } else if(block_size == 512) {
-      kQuantizeBlockwiseNF4<DataType_, 512, 2><<<num_blocks, 256>>>(in_ptr, abs_max_ptr, out_ptr, n);
-    } else if(block_size == 256) {
-      kQuantizeBlockwiseNF4<DataType_, 256, 2><<<num_blocks, 128>>>(in_ptr, abs_max_ptr, out_ptr, n);
-    } else if(block_size == 128) {
-      kQuantizeBlockwiseNF4<DataType_, 128, 2><<<num_blocks, 64>>>(in_ptr, abs_max_ptr, out_ptr, n);
-    } else if(block_size == 64) {
-      kQuantizeBlockwiseNF4<DataType_, 64, 2><<<num_blocks, 32>>>(in_ptr, abs_max_ptr, out_ptr, n);
-    }
+    // const DataType_ *in_ptr = reinterpret_cast<const DataType_*>(input.data<data_t>());
+    // unsigned char *out_ptr = output.mutable_data<unsigned char>();
+    // float *abs_max_ptr = abs_max.mutable_data<float>();
+
+    // if(block_size == 2048) {
+    //   kQuantizeBlockwiseNF4<DataType_, 2048, 4><<<num_blocks, 512>>>(in_ptr, abs_max_ptr, out_ptr, n);
+    // } else if(block_size == 1024) {
+    //   kQuantizeBlockwiseNF4<DataType_, 1024, 4><<<num_blocks, 256>>>(in_ptr, abs_max_ptr, out_ptr, n);
+    // } else if(block_size == 512) {
+    //   kQuantizeBlockwiseNF4<DataType_, 512, 2><<<num_blocks, 256>>>(in_ptr, abs_max_ptr, out_ptr, n);
+    // } else if(block_size == 256) {
+    //   kQuantizeBlockwiseNF4<DataType_, 256, 2><<<num_blocks, 128>>>(in_ptr, abs_max_ptr, out_ptr, n);
+    // } else if(block_size == 128) {
+    //   kQuantizeBlockwiseNF4<DataType_, 128, 2><<<num_blocks, 64>>>(in_ptr, abs_max_ptr, out_ptr, n);
+    // } else if(block_size == 64) {
+    //   kQuantizeBlockwiseNF4<DataType_, 64, 2><<<num_blocks, 32>>>(in_ptr, abs_max_ptr, out_ptr, n);
+    // }
     return {output, abs_max};
 }
 
@@ -226,10 +226,8 @@ std::vector<paddle::Tensor> QuantizeNF4(const paddle::Tensor& input, int block_s
     }
 }
 
-
-
-
 PD_BUILD_OP(quantize_nf4)
     .Inputs({"input"})
     .Outputs({"out", "abs_max"})
+    .Attrs({"block_size: int"})  // int attribute; the Python NF4 quantizer passes it as block_size=self.block_size
     .SetKernelFn(PD_KERNEL(QuantizeNF4));
diff --git a/paddleslim/lc/quantizers/nf4.py b/paddleslim/lc/quantizers/nf4.py
@@ -14,7 +14,8 @@ def __init__(self, block_size=64, double_quant=False):
         self.double_quant_scale = None
 
     def quantize(self, x: paddle.Tensor):
-        out, abs_max = paddleslim_ops.quantize_nf4(x)
+        out, abs_max = paddleslim_ops.quantize_nf4(
+            x, block_size=self.block_size)
         self.quant_scale = abs_max
         return out