47 changes: 22 additions & 25 deletions ai_edge_quantizer/algorithms/uniform_quantize/common_quantize.py
@@ -1165,39 +1165,36 @@ def init_tensor_min_max(
A dictionary containing the min/max values for the tensor, or an empty
dictionary if the tensor data is None.
"""
-  if tensor_data is None:
+  weight_tensor_config = op_info.op_quant_config.weight_tensor_config
+  if tensor_data is None or weight_tensor_config is None:
     return {}
-  else:
-    weight_tensor_config = op_info.op_quant_config.weight_tensor_config
-    quantized_dim = None
-    if weight_tensor_config is not None and (
-        weight_tensor_config.granularity == qtyping.QuantGranularity.CHANNELWISE
-    ):
+  # Get reduce dimension for min/max calculation based on quantization
+  # granularity.
+  granularity = weight_tensor_config.granularity
+  if granularity == qtyping.QuantGranularity.TENSORWISE:
+    reduce_dims = None
+    keep_dims = True
+  elif granularity == qtyping.QuantGranularity.CHANNELWISE:
     quantized_dim = common_utils.get_weight_quantized_dim(
         op_info, tensor_data, weight_tensor_config.granularity
     )
-    if (
-        weight_tensor_config is not None
-        and weight_tensor_config.granularity
-        == qtyping.QuantGranularity.BLOCKWISE
-    ):
-      reshaped_data, reduce_dims = (
+    reduce_dims = common_utils.get_reduce_dims(
+        quantized_dim, tensor_data.shape
+    )
+    keep_dims = True
+  elif uniform_quantize_tensor.is_blockwise(granularity):
+    tensor_data, reduce_dims = (
         uniform_quantize_tensor.reshape_data_for_blockwise(
             tensor_data,
             op_info.op_name,
-            weight_tensor_config.block_size,
+            granularity,
         )
     )
-      return {
-          "min": np.min(reshaped_data, axis=reduce_dims, keepdims=False),
-          "max": np.max(reshaped_data, axis=reduce_dims, keepdims=False),
-      }
-
+    keep_dims = False
   else:
-    reduce_dims = common_utils.get_reduce_dims(
-        quantized_dim, tensor_data.shape
-    )
-    return {
-        "min": np.min(tensor_data, axis=reduce_dims, keepdims=True),
-        "max": np.max(tensor_data, axis=reduce_dims, keepdims=True),
-    }
+    raise ValueError(f"Unsupported granularity: {granularity}")
+  return {
+      "min": np.min(tensor_data, axis=reduce_dims, keepdims=keep_dims),
+      "max": np.max(tensor_data, axis=reduce_dims, keepdims=keep_dims),
+  }
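For intuition, here is a standalone numpy sketch of what the three granularity branches reduce over. The shapes, the quantized dimension, and the block size are illustrative assumptions, not values taken from the library:

import numpy as np

weights = np.arange(24, dtype=np.float32).reshape(4, 6)

# TENSORWISE: reduce over every dimension, keeping dims for broadcasting.
t_min = np.min(weights, axis=None, keepdims=True)   # shape (1, 1)

# CHANNELWISE, assuming quantized_dim == 0: reduce all other dimensions.
c_min = np.min(weights, axis=(1,), keepdims=True)   # shape (4, 1)

# BLOCKWISE with an assumed block size of 2 along dim 1: split that dim
# into (n_blocks, block) and reduce the block axis without keeping dims.
blocked = weights.reshape(4, 3, 2)
b_min = np.min(blocked, axis=2, keepdims=False)     # shape (4, 3)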
@@ -158,7 +158,7 @@ def get_tensor_quant_params(
op_info, tensor_quant_config, tensor_content, tensor_qsv
)

-  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
+  if uniform_quantize_tensor.is_blockwise(tensor_quant_config.granularity):
raise ValueError(
"Blockwise quantization is not supported for dequantized weight"
" recovery."
@@ -147,8 +147,7 @@ def test_fully_connected_blockwise_supported(self):
           weight_tensor_config=_TensorQuantConfig(
               num_bits=8,
               symmetric=True,
-              granularity=qtyping.QuantGranularity.BLOCKWISE,
-              block_size=32,
+              granularity=qtyping.QuantGranularity.BLOCKWISE_32,
           ),
       ),
   )
8 changes: 5 additions & 3 deletions ai_edge_quantizer/algorithms/uniform_quantize/mse.py
@@ -55,7 +55,7 @@ def get_tensor_quant_params(
ValueError: `tensor_qsv` must contain min/max values, or `tensor_content`
must be provided so that they can be inferred.
"""
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
+  if uniform_quantize_tensor.is_blockwise(tensor_quant_config.granularity):
raise ValueError(
"Blockwise quantization is not supported for MSE quantization."
)
@@ -113,13 +113,15 @@ def get_tensor_quant_params(
num_bits=tensor_quant_config.num_bits,
symmetric=tensor_quant_config.symmetric,
quantized_dimension=quantized_dim,
-      block_size=tensor_quant_config.block_size,
+      block_size=uniform_quantize_tensor.extract_block_size_from_granularity(
+          tensor_quant_config.granularity
+      ),
)

quantized_vars = uniform_quantize_tensor.uniform_quantize(
tensor_content,
quant_params,
-      tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE,
+      uniform_quantize_tensor.is_blockwise(tensor_quant_config.granularity),
)

return dataclasses.replace(quant_params, quantized_data=quantized_vars)
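A minimal usage sketch of the guard above, mirroring the mse_test.py change further down. The op_info argument is a placeholder for a qtyping.OpInfo built elsewhere; the import paths are assumed from the file layout:

import numpy as np
from ai_edge_quantizer import qtyping
from ai_edge_quantizer.algorithms.uniform_quantize import mse

def expect_blockwise_rejected(op_info) -> None:
  # Any BLOCKWISE_* member now trips the guard, not just a single enum value.
  config = qtyping.TensorQuantizationConfig(
      num_bits=4,
      symmetric=True,
      granularity=qtyping.QuantGranularity.BLOCKWISE_32,
  )
  try:
    mse.get_tensor_quant_params(
        op_info=op_info,
        tensor_quant_config=config,
        tensor_content=np.ones((4, 32), np.float32),
    )
  except ValueError:
    return  # expected: blockwise is unsupported for MSE quantization
  raise AssertionError("expected blockwise granularity to be rejected")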
2 changes: 1 addition & 1 deletion ai_edge_quantizer/algorithms/uniform_quantize/mse_test.py
@@ -84,7 +84,7 @@ def test_get_tensor_quant_params_raises_error_with_unsupported_granularity(
tensor_quant_config=qtyping.TensorQuantizationConfig(
num_bits=4,
symmetric=True,
-          granularity=qtyping.QuantGranularity.BLOCKWISE,
+          granularity=qtyping.QuantGranularity.BLOCKWISE_32,
),
tensor_content=test_data,
)
ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize.py
@@ -15,6 +15,7 @@

"""Performs naive min/max uniform quantization."""

+import dataclasses
from typing import Any, Optional
import numpy as np
from ai_edge_quantizer import qtyping
@@ -91,26 +92,20 @@ def get_tensor_quant_params(
num_bits=tensor_quant_config.num_bits,
symmetric=tensor_quant_config.symmetric,
quantized_dimension=quantized_dim,
-      block_size=tensor_quant_config.block_size,
+      block_size=uniform_quantize_tensor.extract_block_size_from_granularity(
+          tensor_quant_config.granularity
+      ),
)
if tensor_content is None:
return quant_params

quantized_vars = uniform_quantize_tensor.uniform_quantize(
tensor_content,
quant_params,
-      tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE,
+      uniform_quantize_tensor.is_blockwise(tensor_quant_config.granularity),
)
-  # Update with quantized values.
-  return qtyping.UniformQuantParams(
-      scale=scale,
-      zero_point=zp,
-      num_bits=tensor_quant_config.num_bits,
-      symmetric=tensor_quant_config.symmetric,
-      quantized_dimension=quantized_dim,
-      quantized_data=quantized_vars,
-      block_size=tensor_quant_config.block_size,
-  )
+  return dataclasses.replace(quant_params, quantized_data=quantized_vars)


# TODO: b/333731147 - Use named tuple to store min/max.
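The refactor above leans on dataclasses.replace, which copies every field and overrides only the named ones. A self-contained sketch of that semantics, using a stand-in class rather than the real qtyping.UniformQuantParams:

import dataclasses
from typing import Optional
import numpy as np

@dataclasses.dataclass(frozen=True)
class Params:  # stand-in for qtyping.UniformQuantParams
  scale: np.ndarray
  zero_point: np.ndarray
  block_size: int = 0
  quantized_data: Optional[np.ndarray] = None

p = Params(scale=np.ones((4, 1)), zero_point=np.zeros((4, 1)), block_size=32)
q = dataclasses.replace(p, quantized_data=np.zeros((4, 32), dtype=np.int8))
# All other fields carry over untouched; only quantized_data differs.
assert q.block_size == 32 and q.scale is p.scale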
ai_edge_quantizer/algorithms/uniform_quantize/naive_min_max_quantize_test.py
@@ -17,6 +17,7 @@
from typing import cast

from absl.testing import parameterized
+import ml_dtypes
import numpy as np

from tensorflow.python.platform import googletest
@@ -165,8 +166,7 @@ def test_get_tensor_quant_params_for_blockwise_weight(self):
weight_tensor_config = _TensorQuantConfig(
num_bits=4,
symmetric=True,
-        granularity=qtyping.QuantGranularity.BLOCKWISE,
-        block_size=2,
+        granularity=qtyping.QuantGranularity.BLOCKWISE_32,
)
op_info = qtyping.OpInfo(
op=fc_op,
@@ -176,28 +176,32 @@
weight_tensor_config=weight_tensor_config,
),
)
-    test_data = np.array([[-7, 7], [4, -4], [4, -4], [7, 7]])
+    test_data = np.random.uniform(low=-10, high=10, size=(4, 32)).astype(
+        np.float32
+    )
quant_params = naive_min_max_quantize.get_tensor_quant_params(
op_info=op_info,
tensor_quant_config=weight_tensor_config,
tensor_content=test_data,
)
-    scale = quant_params.scale
     zp = quant_params.zero_point
-    expected_scale = np.array([
-        [1],
-        [0.5703125],
-        [0.5703125],
-        [1],
-    ])
-    expected_zp = np.zeros([4, 1])
-    self.assertTrue(np.array_equal(zp, expected_zp))
-    self.assertTrue(np.array_equal(scale, expected_scale))
+    self.assertEqual(zp.shape, (4, 1))
+    self.assertTrue(np.array_equal(zp, np.zeros([4, 1])))

+    self.assertEqual(quant_params.scale.shape, (4, 1))
+    expected_scales = np.max(np.abs(test_data), axis=1, keepdims=True) / 7.0
+    expected_scales = (
+        expected_scales.astype(ml_dtypes.bfloat16)
+        .astype(np.float16)
+        .astype(np.float32)
+    )
+    self.assertTrue(np.allclose(quant_params.scale, expected_scales, atol=1e-5))

self.assertIsNotNone(quant_params.quantized_data)
self.assertTupleEqual(
cast(np.ndarray, quant_params.quantized_data).shape, test_data.shape
)
-    self.assertEqual(quant_params.block_size, 2)
+    self.assertEqual(quant_params.block_size, 32)
self.assertEqual(quant_params.quantized_dimension, 1)

def test_calibrate_ignores_inf_min_max(self):
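The rewritten test derives its reference scales from first principles instead of hard-coding them. The same arithmetic as a standalone sketch, with the one-block-per-row shape taken from the test and the mantissa rounding from the tensor_zp_scale_from_min_max hunk further down:

import ml_dtypes
import numpy as np

test_data = np.random.uniform(low=-10, high=10, size=(4, 32)).astype(np.float32)

# Symmetric 4-bit: the largest positive level is 2**(4 - 1) - 1 = 7. With
# BLOCKWISE_32, each 32-wide row is exactly one block, so one scale per row.
expected_scales = np.max(np.abs(test_data), axis=1, keepdims=True) / 7.0

# Blockwise scales keep only a 7-bit mantissa: round-trip through bfloat16.
expected_scales = (
    expected_scales.astype(ml_dtypes.bfloat16)
    .astype(np.float16)
    .astype(np.float32)
)
print(expected_scales.shape)  # (4, 1)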
14 changes: 9 additions & 5 deletions ai_edge_quantizer/algorithms/uniform_quantize/octav.py
@@ -131,12 +131,12 @@ def get_tensor_quant_params(
quantized_dim = common_utils.get_weight_quantized_dim(
op_info, tensor_content, tensor_quant_config.granularity
)
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
+  if uniform_quantize_tensor.is_blockwise(tensor_quant_config.granularity):
reshaped_data, reduce_dims = (
uniform_quantize_tensor.reshape_data_for_blockwise(
tensor_content,
op_info.op_name,
-            tensor_quant_config.block_size,
+            tensor_quant_config.granularity,
)
)
else:
@@ -154,7 +154,7 @@
# We created a new dimension in order to reduce properly for blockwise
# quantization, so we need to reshape the clipping constants back to the
# min/max shape for the next step.
-  if tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE:
+  if uniform_quantize_tensor.is_blockwise(tensor_quant_config.granularity):
clipping_constants = clipping_constants.reshape(tensor_min_max["min"].shape)

zp, scale = uniform_quantize_tensor.tensor_zp_scale_from_min_max(
@@ -172,13 +172,17 @@
num_bits=tensor_quant_config.num_bits,
symmetric=tensor_quant_config.symmetric,
quantized_dimension=quantized_dim,
-      block_size=tensor_quant_config.block_size,
+      block_size=uniform_quantize_tensor.extract_block_size_from_granularity(
+          tensor_quant_config.granularity
+      ),
)

quantized_vars = uniform_quantize_tensor.uniform_quantize(
tensor_content,
quant_params,
-      tensor_quant_config.granularity == qtyping.QuantGranularity.BLOCKWISE,
+      is_blockwise_quant=uniform_quantize_tensor.is_blockwise(
+          tensor_quant_config.granularity
+      ),
)

return dataclasses.replace(quant_params, quantized_data=quantized_vars)
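Why the reshape-back above is needed: blockwise reduction happens on a reshaped view with an extra axis, so any statistic computed there has one more dimension than the stored min/max. A toy numpy illustration, with the shapes and quantized dimension assumed rather than taken from the library:

import numpy as np

# Toy stand-ins: a (4, 64) weight, block size 32 along quantized_dim == 1.
weights = np.random.randn(4, 64).astype(np.float32)
blocked = weights.reshape(4, 2, 32)    # within-block axis at quantized_dim + 1

# Min/max reduce over the within-block axis: one entry per block.
block_min = blocked.min(axis=2)        # shape (4, 2)

# A per-block statistic computed on the reshaped view (a stand-in for the
# iteratively computed OCTAV clipping constants) carries the extra axis and
# must be collapsed back to the min/max shape before deriving zp/scale.
clipping = np.abs(blocked).max(axis=2, keepdims=True)   # shape (4, 2, 1)
clipping = clipping.reshape(block_min.shape)            # shape (4, 2)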
3 changes: 1 addition & 2 deletions ai_edge_quantizer/algorithms/uniform_quantize/octav_test.py
@@ -196,8 +196,7 @@ def test_get_tensor_quant_params_sanity_blockwise(self):
tensor_config = qtyping.TensorQuantizationConfig(
num_bits=4,
symmetric=True,
-        granularity=qtyping.QuantGranularity.BLOCKWISE,
-        block_size=32,
+        granularity=qtyping.QuantGranularity.BLOCKWISE_32,
)
fc_op_info = qtyping.OpInfo(
op=self._fc_op,
ai_edge_quantizer/algorithms/uniform_quantize/uniform_quantize_tensor.py
@@ -29,6 +29,11 @@ class IntType:
signed: bool


+def is_blockwise(granularity: qtyping.QuantGranularity) -> bool:
+  """Checks if the quantization granularity is blockwise."""
+  return "BLOCKWISE" in str(granularity)


def get_quantized_range(qtype: IntType) -> tuple[float, float]:
"""Calculates range of the quantized type."""
if qtype.signed:
@@ -40,6 +45,22 @@ def get_quantized_range(qtype: IntType) -> tuple[float, float]:
return float(qmin), float(qmax)


+def extract_block_size_from_granularity(
+    granularity: qtyping.QuantGranularity,
+) -> int:
+  """Gets the block size for blockwise quantization."""
+  if granularity == qtyping.QuantGranularity.BLOCKWISE_32:
+    return 32
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_64:
+    return 64
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_128:
+    return 128
+  elif granularity == qtyping.QuantGranularity.BLOCKWISE_256:
+    return 256
+  else:
+    return 0


def _round_and_clip(
tensor: np.ndarray, qtype: IntType, narrow: bool
) -> np.ndarray:
@@ -157,26 +178,28 @@ def _get_tensor_shape_for_blockwise(


def reshape_data_for_blockwise(
-    tensor_data: np.ndarray, op_name: qtyping.TFLOperationName, block_size: int
+    tensor_data: np.ndarray,
+    op_name: qtyping.TFLOperationName,
+    granularity: qtyping.QuantGranularity,
) -> tuple[np.ndarray, int]:
"""Reshapes data for blockwise quantization.

Args:
tensor_data: The original tensor data.
op_name: The name of the TFL op.
-    block_size: The size of the block.
+    granularity: The quantization granularity for the tensor.

Returns:
A tuple containing the reshaped tensor data and the new reduce dimension.
"""
quantized_dim = tfl_flatbuffer_utils.TFL_OP_TO_BLOCKWISE_WEIGHT_QUANTIZED_DIM[
op_name
]
+  block_size = extract_block_size_from_granularity(granularity)
new_shape = _get_tensor_shape_for_blockwise(
tensor_data.shape, quantized_dim, block_size
)
-  reshaped_data = tensor_data.reshape(new_shape)
-  return reshaped_data, quantized_dim + 1
+  return tensor_data.reshape(new_shape), quantized_dim + 1


def _broadcast_scale_zp_for_blockwise(
@@ -233,21 +256,21 @@ def _broadcast_scale_zp_for_blockwise(
def uniform_quantize(
tensor_data: np.ndarray,
quantization_params: qtyping.UniformQuantParams,
-    is_blockwise: bool = False,
+    is_blockwise_quant: bool = False,
):
"""Uniform quantize a tensor.

Args:
tensor_data: The tensor to be quantized.
quantization_params: The quantization parameters.
-    is_blockwise: Whether the tensor is blockwise quantized.
+    is_blockwise_quant: Whether the tensor is blockwise quantized.

Returns:
The quantized tensor.
"""
# The reshaping for blockwise quantization is unique hence we do this here
# to avoid unexpected broadcast behavior downstream.
-  if is_blockwise:
+  if is_blockwise_quant:
quantization_params = _broadcast_scale_zp_for_blockwise(
tensor_data, quantization_params
)
@@ -435,6 +458,7 @@ def tensor_zp_scale_from_min_max(
Returns:
The zero point and scale of the tensor.
"""

# TODO: b/332574603 - support unsigned data type.
qtype = IntType(
num_bits,
Expand All @@ -445,7 +469,7 @@ def tensor_zp_scale_from_min_max(
pos_clipping_values = None if clipping_values is None else clipping_values
neg_clipping_values = None if clipping_values is None else -clipping_values

-  if granularity == qtyping.QuantGranularity.BLOCKWISE:
+  if is_blockwise(granularity):
# Blockwise quantization uses float16 scale,
# with 7 bit mantissa, so the maximum scale value is 65280 and maximum
# representable range is [-65280 * (2 ** num_bits),
@@ -493,7 +517,7 @@
zp = qmin - bound_min / scale
zp = np.rint(zp)

-  if granularity == qtyping.QuantGranularity.BLOCKWISE:
+  if is_blockwise(granularity):
# Round the scale values to 7 bit mantissa.
scale = (
scale.astype(ml_dtypes.bfloat16).astype(np.float16).astype(np.float32)
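The net effect of this file's changes is that the granularity value alone now encodes both whether quantization is blockwise and at what block size. A self-contained sketch of the same pattern, using a toy enum rather than the real qtyping:

import enum

class QuantGranularity(enum.Enum):  # toy stand-in for qtyping.QuantGranularity
  TENSORWISE = "TENSORWISE"
  CHANNELWISE = "CHANNELWISE"
  BLOCKWISE_32 = "BLOCKWISE_32"
  BLOCKWISE_64 = "BLOCKWISE_64"

def is_blockwise(granularity: QuantGranularity) -> bool:
  # str() of an enum member includes its name, e.g.
  # "QuantGranularity.BLOCKWISE_32", so the substring test covers the family.
  return "BLOCKWISE" in str(granularity)

def extract_block_size(granularity: QuantGranularity) -> int:
  # Mirrors extract_block_size_from_granularity: 0 for non-blockwise members.
  sizes = {QuantGranularity.BLOCKWISE_32: 32, QuantGranularity.BLOCKWISE_64: 64}
  return sizes.get(granularity, 0)

assert is_blockwise(QuantGranularity.BLOCKWISE_64)
assert not is_blockwise(QuantGranularity.CHANNELWISE)
assert extract_block_size(QuantGranularity.BLOCKWISE_32) == 32
assert extract_block_size(QuantGranularity.TENSORWISE) == 0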