
Commit eec7bd3

update
1 parent 1727508 commit eec7bd3

2 files changed (+20 −8 lines)


src/compressed_tensors/quantization/lifecycle/initialize.py (+10 −3)
@@ -177,12 +177,12 @@ def _initialize_scale_zero_point(
         tensor_amax = torch.abs(module.weight.data).max().to(torch.float32)
         # Setting data for now - could possibly be handled later in the pipeline
         value = FP8_E4M3_DATA.max * FP4_E2M1_DATA.max / tensor_amax
-        # TODO: use model.weight.dtype
+        # TODO: use model.weight.dtype after checking
         value = value.to(torch.float32).to(device)
         # Assuming the global scale can be torch.float16/bfloat16/module weight dtype and not only torch.float32?
         init_global_scale = Parameter(value, requires_grad=False)
         register_offload_parameter(
-            module, f"f{base_name}_global_scale", init_global_scale
+            module, f"{base_name}_global_scale", init_global_scale
         )

         if scale_dtype not in [
@@ -201,7 +201,14 @@ def _initialize_scale_zero_point(
     register_offload_parameter(module, f"{base_name}_scale", init_scale)

     if force_zero_point or not quantization_args.symmetric:
-        zp_dtype = quantization_args.pytorch_dtype()
+        if (
+            quantization_args.num_bits == 4
+            and quantization_args.type == QuantizationType.FLOAT
+        ):
+            zp_dtype = FP8_E4M3_DATA.dtype
+        else:
+            zp_dtype = quantization_args.pytorch_dtype()
+
         init_zero_point = Parameter(
             torch.zeros(expected_shape, device=device, dtype=zp_dtype),
             requires_grad=False,
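
For reference, the math this file change introduces can be exercised on its own. The sketch below is a minimal, standalone reconstruction in plain torch, assuming 448.0 and 6.0 as the values behind FP8_E4M3_DATA.max and FP4_E2M1_DATA.max, torch.float8_e4m3fn as what FP8_E4M3_DATA.dtype resolves to, and torch.int8 as an illustrative stand-in for quantization_args.pytorch_dtype(); none of these constants are defined in this diff.

import torch

# Assumed format maxima (stand-ins for FP8_E4M3_DATA.max and FP4_E2M1_DATA.max).
FP8_E4M3_MAX = 448.0
FP4_E2M1_MAX = 6.0

weight = torch.randn(128, 256, dtype=torch.bfloat16)

# Per-tensor global scale, as in the first hunk: fold the FP8 and FP4 ranges
# over the weight's absolute maximum, kept in float32.
tensor_amax = torch.abs(weight).max().to(torch.float32)
global_scale = (FP8_E4M3_MAX * FP4_E2M1_MAX / tensor_amax).to(torch.float32)

# Zero-point dtype selection, as in the second hunk: 4-bit float quantization
# stores zero points as FP8 E4M3; other schemes keep their usual dtype
# (torch.int8 here is only an illustrative stand-in).
num_bits, is_float = 4, True
zp_dtype = torch.float8_e4m3fn if (num_bits == 4 and is_float) else torch.int8

print(global_scale.item(), zp_dtype)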

src/compressed_tensors/quantization/utils/helpers.py (+10 −5)
@@ -74,7 +74,9 @@ def calculate_qparams(

     bit_min, bit_max = calculate_range(quantization_args, device)
     bit_range = bit_max - bit_min
-    zp_dtype = quantization_args.pytorch_dtype()
+    # TODO: update
+    # zp_dtype = quantization_args.pytorch_dtype()
+    zp_dtype = FP8_E4M3_DATA.dtype

     if quantization_args.symmetric:
         # TODO: update for NVFP4 when applying observers
@@ -85,15 +87,18 @@
             and quantization_args.type == QuantizationType.FLOAT
         ):
             assert global_scale is not None
-            scale = max_val_pos / FP4_E2M1_DATA.max # Not needed
-            scale = scale / global_scale
-            scale = scale.to(FP8_E4M3_DATA.dtype) # .to(torch.float32)
+            breakpoint()
+            scales = max_val_pos / FP4_E2M1_DATA.max # Not needed
+            scales = scales / global_scale
+            scales = scales.to(FP8_E4M3_DATA.dtype) # .to(torch.float32)

         else:
             # Divide over bit range over max value?
             scales = max_val_pos / (float(bit_range) / 2)

-        scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
+        # TODO: clamp not implemented for FP8 '
+        breakpoint()
+        # scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
     else:
         scales = (max_vals - min_vals) / float(bit_range)
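
The symmetric NVFP4 branch of calculate_qparams reduces to a few tensor ops once the global scale exists. Below is a minimal sketch in plain torch, assuming 6.0 as the value of FP4_E2M1_DATA.max and torch.float8_e4m3fn as the dtype behind FP8_E4M3_DATA.dtype, with made-up example values for max_val_pos and global_scale.

import torch

FP4_E2M1_MAX = 6.0  # assumed value of FP4_E2M1_DATA.max

# Example per-group absolute maxima and a per-tensor global scale
# (e.g. 448 * 6 / tensor_amax from initialize.py); values are illustrative.
max_val_pos = torch.tensor([2.5, 0.75, 4.0], dtype=torch.float32)
global_scale = torch.tensor(672.0, dtype=torch.float32)

# Local FP4 scale per group, folded into the global scale, stored as FP8 E4M3.
scales = max_val_pos / FP4_E2M1_MAX
scales = scales / global_scale
scales = scales.to(torch.float8_e4m3fn)  # assumed FP8_E4M3_DATA.dtype

# Per the TODO above, torch.clamp is not implemented for FP8 tensors, so any
# eps clamp would need to happen before the cast to FP8.
print(scales.to(torch.float32))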
