From a1c08c5336fb1d35994d26df267178b9dc488482 Mon Sep 17 00:00:00 2001
From: YZW-explorer <2586753502@qq.com>
Date: Wed, 15 May 2024 08:51:43 +0000
Subject: [PATCH 1/2] FP8 PTQ With Physical Dependency 0515

---
 paddleslim/quant/observers/__init__.py   |   5 +-
 paddleslim/quant/observers/fp8uniform.py | 142 +++++++++++++++++++++++
 2 files changed, 145 insertions(+), 2 deletions(-)
 create mode 100644 paddleslim/quant/observers/fp8uniform.py

diff --git a/paddleslim/quant/observers/__init__.py b/paddleslim/quant/observers/__init__.py
index 0b7970ba8..2d6801017 100644
--- a/paddleslim/quant/observers/__init__.py
+++ b/paddleslim/quant/observers/__init__.py
@@ -21,7 +21,7 @@
 from .mse_weight import MSEChannelWiseWeightObserver
 from .abs_max_weight import AbsMaxChannelWiseWeightObserver
 from .groupwise import GroupWiseWeightObserver
-
+from .fp8uniform import FP8UniformObserver
 __all__ = [
     "HistObserver",
     "KLObserver",
@@ -32,5 +32,6 @@
     "AbsmaxObserver",
     "MSEChannelWiseWeightObserver",
     "AbsMaxChannelWiseWeightObserver",
-    "GroupWiseWeightObserver"
+    "GroupWiseWeightObserver",
+    "FP8UniformObserver",
 ]
diff --git a/paddleslim/quant/observers/fp8uniform.py b/paddleslim/quant/observers/fp8uniform.py
new file mode 100644
index 000000000..5ea65d0b0
--- /dev/null
+++ b/paddleslim/quant/observers/fp8uniform.py
@@ -0,0 +1,142 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.quantization.base_observer import BaseObserver
+from paddle.quantization.factory import ObserverFactory
+
+
+class FP8UniformObserver(ObserverFactory):
+    r"""
+    Observer factory for FP8 post-training quantization. The produced
+    observer collects the maximum absolute value of the target tensor and
+    derives a symmetric scale for the configured float8 type.
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.quantization import QuantConfig
+            from paddleslim.quant.observers import FP8UniformObserver
+
+            observer = FP8UniformObserver()
+            q_config = QuantConfig(activation=observer, weight=observer)
+    """
+
+    def __init__(self):
+        super(FP8UniformObserver, self).__init__()
+
+    def _get_class(self):
+        return FP8UniformObserverLayer
+
+
+class FP8UniformObserverLayer(BaseObserver):
+    """ Observer layer for FP8 uniform quantization. It records the maximum
+    absolute value of the observed tensor and computes the scale that maps
+    floating-point values onto the representable range of the target float8
+    type. The quantization is symmetric: the floating-point range is relaxed
+    to be symmetric around zero and the zero-point is always 0, so that
+    common operations such as zero padding introduce no quantization error.
+
+    Args:
+        layer (Layer): The layer to be observed.
+        quant_bits (int, optional): The number of bits for quantization.
+            Default: 8.
+    """
+
+    def __init__(
+            self,
+            layer,
+            quant_bits=8, ):
+        super(FP8UniformObserverLayer, self).__init__()
+        self._float8_type = "float8_e4m3fn"
+        self._quant_bits = quant_bits
+        self._min = None
+        self._max = paddle.to_tensor(1e-7, dtype="float32")
+        self._qmin = None
+        self._qmax = None
+        self._scale = None
+        self._zero_point = None
+
+    def qmin_qmax(self):
+        """ Calculate the range of the quantized values based on the
+        configured float8_type."""
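+        # These constants are the largest finite values of the OCP FP8
+        # formats; they can be derived from the exponent/mantissa layouts:
+        #   float8_e4m3fn: 2**8 * (1 + 6/8) = 448.0 (the all-ones mantissa
+        #       with the top exponent encodes NaN, so 0b110 is the largest
+        #       usable mantissa)
+        #   float8_e5m2: 2**15 * (1 + 3/4) = 57344.0 (the top exponent is
+        #       reserved for inf/NaN)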
+        if self._float8_type == "float8_e4m3fn":
+            self._qmin = -448.0
+            self._qmax = 448.0
+        else:
+            self._qmin = -57344.0
+            self._qmax = 57344.0
+        return self._qmin, self._qmax
+
+    def min_value(self) -> float:
+        """ The minimum value of the observed floating-point numbers."""
+        return self._min
+
+    def max_value(self) -> float:
+        """ The maximum value of the observed floating-point numbers."""
+        return self._max
+
+    def cal_scales(self):
+        """ Calculate the scale and zero-point based on min_value and
+        max_value.
+        """
+        assert self.min_value() is not None and self.max_value() is not None
+        _qmin, _qmax = self.qmin_qmax()
+        # For one-sided distributions, the range (_min, _max) is relaxed to
+        # include zero so that common operations such as zero padding do not
+        # cause quantization errors.
+        _min = min(self.min_value(), 0.)
+        _max = max(self.max_value(), 0.)
+        _abs_max = max(-_min, _max)
+        self._scale = _qmax / _abs_max
+        self._zero_point = 0
+        return self._scale, self._zero_point
+
+    def scales(self):
+        """ Return output scales.
+        """
+        if self._scale is None:
+            self.cal_thresholds()
+        return self._scale
+
+    def forward(self, inputs):
+        """ Observe the input tensor and update the running min/max.
+        """
+        self._min, self._max = self.cal_min_max(inputs)
+        return inputs
+
+    def cal_min_max(self, inputs):
+        """ Return the (min, max) pair used for calibration: the minimum is
+        fixed to 0 and the maximum is the running absolute maximum.
+        """
+        abs_max_val = paddle.max(paddle.abs(inputs.cast("float32")))
+        abs_max_val = paddle.maximum(abs_max_val, self._max)
+        return 0, abs_max_val
+
+    def bit_length(self):
+        """ Return the bit length of quantized data.
+        """
+        return self._quant_bits
+
+    def quant_axis(self):
+        """ Return the quantization axis (-1 means per-tensor).
+        """
+        return -1
+
+    def cal_thresholds(self):
+        """ Compute the scale and zero-point from the collected statistics.
+        """
+        if self._scale is not None:
+            self._zero_point = 0
+            return
+        self._scale, self._zero_point = self.cal_scales()
+
+    def zero_points(self):
+        """ Return output zero points.
+        """
+        if self._zero_point is None:
+            self.cal_thresholds()
+        return self._zero_point
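
As a quick sanity check of the observer above, the following standalone
sketch (not part of the patch) mirrors its abs-max scale computation for
the float8_e4m3fn range:

    import paddle

    E4M3_QMAX = 448.0  # largest finite float8_e4m3fn value

    def fp8_abs_max_scale(x):
        # Running abs-max, as FP8UniformObserverLayer.cal_min_max() computes it.
        abs_max = float(paddle.max(paddle.abs(x.cast("float32"))))
        abs_max = max(abs_max, 1e-7)  # same floor the observer uses for _max
        # Symmetric quantization: abs-max maps onto qmax, zero-point is 0.
        return E4M3_QMAX / abs_max, 0

    x = paddle.uniform([4, 16], min=-3.0, max=3.0)
    scale, zero_point = fp8_abs_max_scale(x)  # scale == 448 / max|x|, zp == 0
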
From 44ec09be16c15e8bed54599e78c23dc6a7e412f1 Mon Sep 17 00:00:00 2001
From: YZW-explorer <2586753502@qq.com>
Date: Thu, 16 May 2024 07:48:36 +0000
Subject: [PATCH 2/2] FP8 PTQ With Physical Dependency 0516

---
 paddleslim/quant/layers/__init__.py          |  4 +-
 paddleslim/quant/layers/fp8quanted_linear.py | 41 ++++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)
 create mode 100644 paddleslim/quant/layers/fp8quanted_linear.py

diff --git a/paddleslim/quant/layers/__init__.py b/paddleslim/quant/layers/__init__.py
index 34e5a1ea5..7a33d3f40 100644
--- a/paddleslim/quant/layers/__init__.py
+++ b/paddleslim/quant/layers/__init__.py
@@ -13,5 +13,5 @@
 # limitations under the License.
 
 from .parallel_linear import QuantizedColumnParallelLinear, QuantizedRowParallelLinear
-
-__all__ = ["QuantizedColumnParallelLinear", "QuantizedRowParallelLinear"]
\ No newline at end of file
+from .fp8quanted_linear import FP8QuantedLinear
+__all__ = ["QuantizedColumnParallelLinear", "QuantizedRowParallelLinear", "FP8QuantedLinear"]
\ No newline at end of file
diff --git a/paddleslim/quant/layers/fp8quanted_linear.py b/paddleslim/quant/layers/fp8quanted_linear.py
new file mode 100644
index 000000000..e865c8d1b
--- /dev/null
+++ b/paddleslim/quant/layers/fp8quanted_linear.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.nn import Layer
+from paddle.nn import functional as F
+
+
+# NOTE: FP8ConvertibleQuantedLayer is not defined or imported by this patch;
+# it is assumed to be provided elsewhere in this series.
+class FP8QuantedLinear(FP8ConvertibleQuantedLayer):
+    """
+    The computational logic of FP8QuantedLinear is the same as Linear.
+    The only difference is that its inputs are fake-quantized.
+    """
+
+    def __init__(self, layer: Layer, q_config):
+        super().__init__()
+        # For Linear
+        self.weight = layer.weight
+        self.bias = layer.bias
+        self.name = layer.name
+        # For FakeQuant
+        self.weight_quanter = None
+        self.activation_quanter = None
+        if q_config.weight is not None:
+            self.weight_quanter = q_config.weight._instance(layer)
+        if q_config.activation is not None:
+            self.activation_quanter = q_config.activation._instance(layer)
+
+    def forward(self, input):
+        quant_input = input
+        quant_weight = self.weight
+        if self.activation_quanter is not None:
+            quant_input = self.activation_quanter(input)
+        if self.weight_quanter is not None:
+            quant_weight = self.weight_quanter(self.weight)
+        return self._linear_forward(quant_input, quant_weight)
+
+    def _linear_forward(self, input, weight):
+        out = F.linear(x=input, weight=weight, bias=self.bias, name=self.name)
+        return out
+
+    def weights_to_quanters(self):
+        return [('weight', 'weight_quanter')]
+
+    def activation_quanters(self):
+        return ['activation_quanter']
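
For reference, a minimal sketch of how the two patches compose (illustrative
only: it assumes the FP8ConvertibleQuantedLayer base class is available, and
the SimpleNamespace below merely stands in for the per-layer quantization
config that a full PTQ flow would supply):

    import types

    import paddle
    from paddleslim.quant.layers import FP8QuantedLinear
    from paddleslim.quant.observers import FP8UniformObserver

    # Observe both activations and weights with the FP8 abs-max observer.
    observer = FP8UniformObserver()
    # FP8QuantedLinear only needs a config exposing .weight and .activation
    # observer factories (see its __init__ above).
    layer_cfg = types.SimpleNamespace(weight=observer, activation=observer)

    linear = paddle.nn.Linear(16, 32)
    q_linear = FP8QuantedLinear(linear, layer_cfg)

    # Calibration: forward passes let the observers record abs-max statistics,
    # from which the FP8 scales are derived.
    x = paddle.rand([8, 16])
    out = q_linear(x)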