Commit 451ac5c

rewu93 authored and copybara-github committed

Add recipe building utils to AEQ.

PiperOrigin-RevId: 795129649
1 parent 2205e48 commit 451ac5c
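
For orientation, a minimal usage sketch of the new recipe helpers (the Quantizer calls mirror those exercised in recipe_test.py below; the model path is a placeholder, not part of this commit):

    from ai_edge_quantizer import quantizer
    from ai_edge_quantizer import recipe

    qt = quantizer.Quantizer('model.tflite')  # placeholder path
    # Each new helper returns a ready-made recipe covering all supported ops.
    qt.load_quantization_recipe(recipe.dynamic_wi8_afp32())
    result = qt.quantize()  # dynamic recipes need no calibration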

File tree

2 files changed: +240 -58 lines


ai_edge_quantizer/recipe.py

Lines changed: 154 additions & 42 deletions
@@ -15,51 +15,163 @@
 
 """Quantization recipe module."""
 
+from ai_edge_quantizer import algorithm_manager
+from ai_edge_quantizer import qtyping
+from ai_edge_quantizer import recipe_manager
 
-def dynamic_wi8_afp32():
-  """Returns a dynamic quantization recipe with int8 weights and float32 activation."""
-  return [
-      dict({
-          'regex': '.*',
-          'operation': '*',
-          'algorithm_key': 'min_max_uniform_quantize',
-          'op_config': {
-              'weight_tensor_config': {
-                  'num_bits': 8,
-                  'symmetric': True,
-                  'granularity': 'CHANNELWISE',
-                  'dtype': 'INT',
-                  'block_size': 0,
-              },
-              'compute_precision': 'INTEGER',
-              'explicit_dequantize': False,
-              'skip_checks': False,
-          },
-      })
-  ]
+AlgorithmName = algorithm_manager.AlgorithmName
 
 
-def dynamic_wi4_afp32():
-  """Returns a dynamic quantization recipe with int4 weights and float32 activation."""
-  return [
-      dict({
-          'regex': '.*',
-          'operation': '*',
-          'algorithm_key': 'min_max_uniform_quantize',
-          'op_config': {
-              'weight_tensor_config': {
-                  'num_bits': 4,
-                  'symmetric': True,
-                  'granularity': 'CHANNELWISE',
-                  'dtype': 'INT',
-                  'block_size': 0,
-              },
-              'compute_precision': 'INTEGER',
-              'explicit_dequantize': False,
-              'skip_checks': False,
-          },
-      })
-  ]
+def dynamic_wi8_afp32(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a dynamic quantization recipe with int8 weights and float32 activation.
+
+  All supported ops will be quantized with int8 weights and float32 activations,
+  which will be dynamically quantized to int8 during inference to enable int8
+  compute. The model quality may suffer due to the on-the-fly quantization. If
+  quality is a concern, consider using weight-only quantization.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A dynamic quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_dynamic_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      num_bits=8,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def dynamic_wi4_afp32(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a dynamic quantization recipe with int4 weights and float32 activation.
+
+  All supported ops will be quantized with int4 weights and float32 activations,
+  which will be dynamically quantized to int4 during inference to enable int4
+  compute.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A dynamic quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_dynamic_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      num_bits=4,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def weight_only_wi8_afp32(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a weight-only quantization recipe with int8 weights and float32 activation.
+
+  All supported ops will be quantized with int8 weights and float32 activations.
+  The weights will be explicitly dequantized before being fed into the op to
+  enable float compute thus retain model quality. If latency is a concern,
+  consider using dynamic range quantization.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A weight-only quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_weight_only_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      num_bits=8,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def weight_only_wi4_afp32(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a weight-only quantization recipe with int4 weights and float32 activation.
+
+  All supported ops will be quantized with int4 weights and float32 activations.
+  The weights will be explicitly dequantized before being fed into the op to
+  enable float compute thus retain model quality.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A weight-only quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_weight_only_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      num_bits=4,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def static_wi8_ai8(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a static quantization recipe with int8 weights and int8 activations.
+
+  All supported ops will be quantized with int8 weights and int8 activations.
+  Calibration is needed to use this recipe.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A static quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_static_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      activation_num_bits=8,
+      weight_num_bits=8,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
+
+
+def static_wi8_ai16(
+    algorithm_key: AlgorithmName = AlgorithmName.MIN_MAX_UNIFORM_QUANT,
+):
+  """Returns a static quantization recipe with int8 weights and int16 activations.
+
+  All supported ops will be quantized with int8 weights and int16 activations.
+  Calibration is needed to use this recipe.
+
+  Args:
+    algorithm_key: The algorithm to use for quantization.
+
+  Returns:
+    A static quantization recipe.
+  """
+  rp_manager = recipe_manager.RecipeManager()
+  rp_manager.add_static_config(
+      regex='.*',
+      operation_name=qtyping.TFLOperationName.ALL_SUPPORTED,
+      activation_num_bits=16,
+      weight_num_bits=8,
+      algorithm_key=algorithm_key,
+  )
+  return rp_manager.get_quantization_recipe()
 
 
 def dynamic_legacy_wi8_afp32():
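
The six helpers above replace the hand-written list-of-dicts recipes with RecipeManager calls while still returning the same JSON-serializable recipe format. A sketch of the new algorithm_key override, assuming only what this diff shows (MIN_MAX_UNIFORM_QUANT is the sole AlgorithmName member confirmed here):

    from ai_edge_quantizer import recipe

    # Passing the default explicitly; recipe.AlgorithmName is the module-level
    # alias for algorithm_manager.AlgorithmName added in this change.
    w4 = recipe.weight_only_wi4_afp32(
        algorithm_key=recipe.AlgorithmName.MIN_MAX_UNIFORM_QUANT
    )
    # The result is a plain list of per-op config dicts with the same keys as
    # the removed hand-written recipes ('regex', 'operation', 'algorithm_key',
    # 'op_config').
    a8w8 = recipe.static_wi8_ai8()  # static recipes require calibration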

ai_edge_quantizer/recipe_test.py

Lines changed: 86 additions & 16 deletions
@@ -21,6 +21,7 @@
 from ai_edge_quantizer import quantizer
 from ai_edge_quantizer import recipe
 from ai_edge_quantizer.utils import test_utils
+from ai_edge_quantizer.utils import tfl_interpreter_utils
 
 
 _TEST_DATA_PREFIX_PATH = test_utils.get_path_to_datafile('')
@@ -30,33 +31,76 @@ class RecipeTest(parameterized.TestCase):
 
   def setUp(self):
     super().setUp()
-    self._test_model_path = os.path.join(
+    # Weights has < 1024 elements so legacy recipe will not quantize it.
+    self._small_model_path = os.path.join(
         _TEST_DATA_PREFIX_PATH,
         'tests/models/single_conv2d_transpose_bias.tflite',
     )
+    self._test_model_path = os.path.join(
+        _TEST_DATA_PREFIX_PATH,
+        'tests/models/conv_fc_mnist.tflite',
+    )
 
-  def _quantize_with_recipe_func(self, recipe_func):
-    qt = quantizer.Quantizer(self._test_model_path)
+  def _quantize_with_recipe_func(self, recipe_func, test_model_path):
+    qt = quantizer.Quantizer(test_model_path)
     qt.load_quantization_recipe(recipe_func())
     self.assertIsNone(qt._result.quantized_model)
-    quant_result = qt.quantize()
-    self.assertIsNotNone(quant_result.quantized_model)
-    return quant_result
+    if qt.need_calibration:
+      calibration_data = tfl_interpreter_utils.create_random_normal_input_data(
+          qt.float_model,
+          num_samples=1,
+      )
+      calibration_result = qt.calibrate(calibration_data)
+      quantization_result = qt.quantize(calibration_result)
+    else:
+      quantization_result = qt.quantize()
+    self.assertIsNotNone(quantization_result.quantized_model)
+    return quantization_result
 
   def test_quantization_from_dynamic_wi8_afp32_func_succeeds(self):
-    quant_result = self._quantize_with_recipe_func(recipe.dynamic_wi8_afp32)
+    quant_result = self._quantize_with_recipe_func(
+        recipe.dynamic_wi8_afp32, self._test_model_path
+    )
+    self.assertLess(
+        len(quant_result.quantized_model),
+        os.path.getsize(self._test_model_path),
+    )
+
+  def test_quantization_from_dynamic_wi4_afp32_func_succeeds(self):
+    quant_result = self._quantize_with_recipe_func(
+        recipe.dynamic_wi4_afp32, self._test_model_path
+    )
+    self.assertLess(
+        len(quant_result.quantized_model),
+        os.path.getsize(self._test_model_path),
+    )
+
+  def test_quantization_from_weight_only_wi8_afp32_func_succeeds(self):
+    quant_result = self._quantize_with_recipe_func(
+        recipe.weight_only_wi8_afp32, self._test_model_path
+    )
+    self.assertLess(
+        len(quant_result.quantized_model),
+        os.path.getsize(self._test_model_path),
+    )
+
+  def test_quantization_from_weight_only_wi4_afp32_func_succeeds(self):
+    quant_result = self._quantize_with_recipe_func(
+        recipe.weight_only_wi4_afp32, self._test_model_path
+    )
     self.assertLess(
         len(quant_result.quantized_model),
         os.path.getsize(self._test_model_path),
     )
 
   def test_quantization_from_dynamic_legacy_wi8_afp32_func_succeeds(self):
     quant_result = self._quantize_with_recipe_func(
-        recipe.dynamic_legacy_wi8_afp32
+        recipe.dynamic_legacy_wi8_afp32,
+        self._small_model_path,
     )
     self.assertLen(
         quant_result.quantized_model,
-        os.path.getsize(self._test_model_path),
+        os.path.getsize(self._small_model_path),
     )
 
   @parameterized.named_parameters(
@@ -65,28 +109,54 @@ def test_quantization_from_dynamic_legacy_wi8_afp32_func_succeeds(self):
           recipe_json_path='recipes/dynamic_wi8_afp32_recipe.json',
           recipe_func=recipe.dynamic_wi8_afp32,
       ),
+      dict(
+          testcase_name='weight_only_wi8_afp32',
+          recipe_json_path='recipes/default_af32w8float_recipe.json',
+          recipe_func=recipe.weight_only_wi8_afp32,
+      ),
+      dict(
+          testcase_name='weight_only_wi4_afp32',
+          recipe_json_path='recipes/default_af32w4float_recipe.json',
+          recipe_func=recipe.weight_only_wi4_afp32,
+      ),
       dict(
           testcase_name='dynamic_legacy_wi8_afp32',
           recipe_json_path='recipes/dynamic_legacy_wi8_afp32_recipe.json',
           recipe_func=recipe.dynamic_legacy_wi8_afp32,
       ),
+      dict(
+          testcase_name='a8w8',
+          recipe_json_path='recipes/default_a8w8_recipe.json',
+          recipe_func=recipe.static_wi8_ai8,
+      ),
+      dict(
+          testcase_name='a16w8',
+          recipe_json_path='recipes/default_a16w8_recipe.json',
+          recipe_func=recipe.static_wi8_ai16,
+      ),
   )
   def test_recipe_func_and_json_matches(self, recipe_json_path, recipe_func):
     # Quantize with recipe from function in recipe module.
-    quant_result_from_func = self._quantize_with_recipe_func(recipe_func)
+    quant_result_from_func = self._quantize_with_recipe_func(
+        recipe_func, self._test_model_path
+    )
 
     # Quantize with recipe from json file.
     qt_json = quantizer.Quantizer(self._test_model_path)
     json_recipe_path = os.path.join(_TEST_DATA_PREFIX_PATH, recipe_json_path)
     qt_json.load_quantization_recipe(json_recipe_path)
-    quant_result_from_json = qt_json.quantize()
+    if qt_json.need_calibration:
+      calibration_data = tfl_interpreter_utils.create_random_normal_input_data(
+          qt_json.float_model,
+          num_samples=1,
+      )
+      calibration_result = qt_json.calibrate(calibration_data)
+      quant_result_from_json = qt_json.quantize(calibration_result)
+    else:
+      quant_result_from_json = qt_json.quantize()
     self.assertIsNotNone(quant_result_from_json.quantized_model)
 
-    # Check if the recipes and quantized models match.
-    self.assertEqual(
-        quant_result_from_func.recipe,
-        quant_result_from_json.recipe,
-    )
+    # Check if the quantized models match.
     self.assertEqual(
         len(quant_result_from_func.quantized_model),
         len(quant_result_from_json.quantized_model),
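
The tests branch on need_calibration because the new static recipes quantize activations and therefore require calibration before quantization. A condensed sketch of that flow, assuming a placeholder model path:

    from ai_edge_quantizer import quantizer
    from ai_edge_quantizer import recipe
    from ai_edge_quantizer.utils import tfl_interpreter_utils

    qt = quantizer.Quantizer('model.tflite')  # placeholder path
    qt.load_quantization_recipe(recipe.static_wi8_ai8())
    if qt.need_calibration:
      # Random normal inputs suffice for a smoke test; real deployments
      # should calibrate with representative samples.
      data = tfl_interpreter_utils.create_random_normal_input_data(
          qt.float_model, num_samples=1
      )
      result = qt.quantize(qt.calibrate(data))
    else:
      result = qt.quantize()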
