InternLM · 43758726 · Apr 2, 2026 · Apr 2, 2026 · Apr 3, 2026 · Apr 3, 2026
diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py
@@ -8,12 +8,11 @@
 import torch
 from torch import nn
 
+from lmdeploy.lite.apis.calibrate import LAYER_TYPE_MAP, calibrate
 from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP, awq_layers, quant_weights, smooth_layers
 from lmdeploy.lite.utils import collect_target_modules
 from lmdeploy.utils import try_import_deeplink
 
-from .calibrate import LAYER_TYPE_MAP, calibrate
-
 
 def save_vl_model(vl_model, model_path, dst_path):
     vl_model.save_pretrained(dst_path, safe_serialization=True)

diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
@@ -260,14 +260,10 @@ def calibrate(model: str,
         model.config.use_cache = False
         if dtype == 'float16':
             model.half()
-        elif dtype == 'bfloat16':
+        elif dtype == 'bfloat16' or (dtype == 'auto' and model.config.torch_dtype == torch.bfloat16):
             assert torch.cuda.is_bf16_supported(
             ), 'your device does not support bfloat16 please set --dtype float16'  # noqa
             model.to(torch.bfloat16)
-        elif dtype == 'auto' and model.config.torch_dtype == torch.bfloat16:
-            print('Warning: we cast model to float16 to prevent OOM. You'
-                  ' may enforce it bfloat16 by `--dtype bfloat16`')
-            model.half()
         model.eval()
 
     model_type = type(model).__name__

diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py
@@ -236,7 +236,10 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
               'clamping w_scales.pow(1 - alpha) to 1e-4')
         w_scales_pow = w_scales_pow.clamp(min=1e-4)
     scales = (act_scales.pow(alpha) / w_scales_pow).clamp(min=1e-4).to(device).to(dtype)
-    scales = scales / (scales.max() * scales.min()).sqrt()
+    # multiply in float32: in half precision scales.max() * scales.min() can overflow to inf
+    denom = (scales.max().float() * scales.min().float()).sqrt()
+    denom = denom.to(dtype=dtype)
+    scales = scales / denom
 
     # (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale
     # phi3 fused qkv and gate_up

diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py
@@ -80,6 +80,11 @@ def __init__(self,
 
     def _guess_num_heads(self, model):
 
+        if hasattr(model.config, 'text_config'):
+            model.config = model.config.text_config
+        if hasattr(model.config, 'llm_config'):
+            model.config = model.config.llm_config
+
         if hasattr(model.config, 'num_key_value_heads'):
             num_kv_heads = model.config.num_key_value_heads
         else:

diff --git a/lmdeploy/lite/utils/load.py b/lmdeploy/lite/utils/load.py
@@ -66,10 +66,6 @@ def load_hf_from_pretrained(pretrained_model_name_or_path, dtype: Literal['float
         torch_dtype = torch.bfloat16
     elif dtype == 'float16':
         torch_dtype = torch.float16
-    elif dtype == 'auto' and torch_dtype == torch.bfloat16:
-        print('Warning: we cast model to float16 to prevent OOM. '
-              'You may enforce it bfloat16 by `--dtype bfloat16`')
-        torch_dtype = torch.float16
 
     with LoadNoInit():
         # Load model