@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
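Both example READMEs pin these exact versions, so a quick post-install check helps catch a stale environment. A minimal sketch, using only standard pip/Python version introspection:

```bash
# Confirm the pins resolved to the expected versions
pip show neural-compressor-pt auto-round | grep -E '^(Name|Version):'
# The vLLM fork installs editable; print its version and where it resolves from
python -c "import vllm; print(vllm.__version__, vllm.__file__)"
```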
@@ -16,7 +16,7 @@ pip uninstall flash_attn
 ### Quantize Model
 - Export model path
 ```bash
-export MODEL=deepseek-ai/DeepSeek-R1
+export MODEL=unsloth/DeepSeek-R1-BF16
 ```

 - MXFP8
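The source checkpoint switches from the upstream deepseek-ai/DeepSeek-R1 release, which ships FP8 weights, to unsloth's BF16 conversion, presumably so the CPU loading path below sees a plain BF16 source. Staging the download up front avoids a long stall on first load; a sketch, with an illustrative local directory name:

```bash
# Pre-fetch the BF16 checkpoint (local dir name is hypothetical)
huggingface-cli download $MODEL --local-dir ./DeepSeek-R1-BF16
export MODEL=./DeepSeek-R1-BF16
```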
@@ -40,11 +40,12 @@ def get_model_and_tokenizer(model_name):
     fp32_model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="cpu",
-        trust_remote_code=True,
+        trust_remote_code=False,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        trust_remote_code=True,
+        trust_remote_code=False,
     )
     return fp32_model, tokenizer

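Read as a whole, the hunk leaves the loader as below: trust_remote_code can drop to False presumably because the architecture is now supported natively in transformers, and dtype="auto" keeps the checkpoint's stored dtype instead of upcasting to FP32 on CPU. A sketch reconstructed from the diff, imports assumed from the top of the script:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

def get_model_and_tokenizer(model_name):
    # Load on CPU in the checkpoint's stored dtype (BF16 for the
    # unsloth conversion) without executing repo-supplied code.
    fp32_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",
        trust_remote_code=False,
        dtype="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=False,
    )
    return fp32_model, tokenizer
```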
@@ -68,6 +69,7 @@ def quant_model(args):
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        reloading=False,
     )

     # quantizer execute
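The `# quantizer execute` marker suggests the script then hands the config to Intel Neural Compressor's 3.x two-step API; a sketch of that flow under that assumption (the actual call site is outside this hunk):

```python
from neural_compressor.torch.quantization import prepare, convert

# INC 3.x flow: attach the AutoRound config, calibrate, then convert
model = prepare(fp32_model, quant_config)
# ... calibration pass over a small dataset happens here ...
model = convert(model)
```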
@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
     --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt
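Dropping --limit 64 moves the evaluation from a 64-sample spot check to the full task, which is what reported accuracy should come from. For smoke tests, lm-eval's standard --limit flag can still be added back by hand; a sketch, where ${QMODEL} stands in for whatever model arguments the full command above uses:

```bash
# Quick sanity pass: cap each task at 64 samples, as the old script did
lm_eval --model vllm \
    --model_args pretrained=${QMODEL} \
    --tasks $TASK_NAME \
    --batch_size $BATCH_SIZE \
    --limit 64 \
    --seed 42 \
    --output_path ${OUTPUT_DIR}
```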
@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
@@ -62,11 +62,14 @@ def quant_model(args):
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        enable_torch_compile=args.enable_torch_compile,
+        enable_torch_compile=True,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
+        disable_opt_rtn=True,
+        low_gpu_mem_usage=True,
         output_dir=output_dir,
+        reloading=False,
     )

     # quantizer execute
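The Qwen path now hard-codes its performance options rather than threading them through argparse. Reconstructed from the diff, with flag intent noted as the names suggest (reloading=False is taken from the diff itself; its exact semantics aren't documented here):

```python
quant_config = AutoRoundConfig(
    tokenizer=tokenizer,
    scheme=config["scheme"],        # e.g. MXFP4 / MXFP8
    enable_torch_compile=True,      # always on; no longer a CLI switch
    iters=config["iters"],
    fp_layers=config["fp_layers"],  # layers kept in higher precision
    export_format=export_format,
    disable_opt_rtn=True,           # skip auto-round's optimized-RTN mode
    low_gpu_mem_usage=True,         # reduce peak GPU memory during tuning
    output_dir=output_dir,
    reloading=False,
)
```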
@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
     --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt