Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions lmdeploy/lite/apis/auto_awq.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,11 @@
import torch
from torch import nn

from lmdeploy.lite.apis.calibrate import LAYER_TYPE_MAP, calibrate
from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP, awq_layers, quant_weights, smooth_layers
from lmdeploy.lite.utils import collect_target_modules
from lmdeploy.utils import try_import_deeplink

from .calibrate import LAYER_TYPE_MAP, calibrate


def save_vl_model(vl_model, model_path, dst_path):
vl_model.save_pretrained(dst_path, safe_serialization=True)
Expand Down
6 changes: 1 addition & 5 deletions lmdeploy/lite/apis/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,14 +260,10 @@ def calibrate(model: str,
model.config.use_cache = False
if dtype == 'float16':
model.half()
elif dtype == 'bfloat16':
elif dtype == 'bfloat16' or (dtype == 'auto' and model.config.torch_dtype == torch.bfloat16):
assert torch.cuda.is_bf16_supported(
), 'your device does not support bfloat16 please set --dtype float16' # noqa
model.to(torch.bfloat16)
elif dtype == 'auto' and model.config.torch_dtype == torch.bfloat16:
print('Warning: we cast model to float16 to prevent OOM. You'
' may enforce it bfloat16 by `--dtype bfloat16`')
model.half()
model.eval()

model_type = type(model).__name__
Expand Down
5 changes: 4 additions & 1 deletion lmdeploy/lite/quantization/awq.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,10 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
'clamping w_scales.pow(1 - alpha) to 1e-4')
w_scales_pow = w_scales_pow.clamp(min=1e-4)
scales = (act_scales.pow(alpha) / w_scales_pow).clamp(min=1e-4).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()
# compute the product in float32 so that scales.max() * scales.min() cannot overflow to inf in a low-precision dtype
denom = (scales.max().float() * scales.min().float()).sqrt()
denom = denom.to(dtype=dtype)
scales = scales / denom

# (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale
# phi3 fused qkv and gate_up
Expand Down
5 changes: 5 additions & 0 deletions lmdeploy/lite/quantization/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ def __init__(self,

def _guess_num_heads(self, model):

if hasattr(model.config, 'text_config'):
model.config = model.config.text_config
if hasattr(model.config, 'llm_config'):
model.config = model.config.llm_config
Comment on lines +83 to +86
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`_guess_num_heads()` mutates `model.config` by reassigning it to the nested `text_config` / `llm_config`. This has side effects for the rest of calibration (e.g., later code reads `model.config.hidden_size` and `use_cache`, and updates/saves the config) and can break models whose wrapper config contains fields that are not present on the nested config. Use a local variable instead (e.g., `cfg = model.config`, then unwrap `cfg`), and leave `model.config` unchanged.

Copilot uses AI. Check for mistakes.

if hasattr(model.config, 'num_key_value_heads'):
num_kv_heads = model.config.num_key_value_heads
else:
Expand Down
4 changes: 0 additions & 4 deletions lmdeploy/lite/utils/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,6 @@ def load_hf_from_pretrained(pretrained_model_name_or_path, dtype: Literal['float
torch_dtype = torch.bfloat16
elif dtype == 'float16':
torch_dtype = torch.float16
elif dtype == 'auto' and torch_dtype == torch.bfloat16:
print('Warning: we cast model to float16 to prevent OOM. '
'You may enforce it bfloat16 by `--dtype bfloat16`')
torch_dtype = torch.float16

with LoadNoInit():
# Load model
Expand Down
Loading