diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
index fee88c56a89..85ad84a17f4 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
@@ -16,7 +16,7 @@ pip uninstall flash_attn
 ### Quantize Model
 - Export model path
 ```bash
-export MODEL=deepseek-ai/DeepSeek-R1
+export MODEL=unsloth/DeepSeek-R1-BF16
 ```
 
 - MXFP8
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index 9becc2cecf9..496a9f26e68 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -40,11 +40,12 @@ def get_model_and_tokenizer(model_name):
     fp32_model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="cpu",
-        trust_remote_code=True,
+        trust_remote_code=False,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        trust_remote_code=True,
+        trust_remote_code=False,
     )
 
     return fp32_model, tokenizer
@@ -68,6 +69,7 @@ def quant_model(args):
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        reloading=False,
     )
 
     # quantizer execute
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index 1d805c7872b..d0039e5ecff 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
    --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt
\ No newline at end of file
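Reviewer note: the deepseek/quantize.py hunks above all feed one AutoRoundConfig call (the qwen counterpart below passes a few extra flags). For context, here is a minimal sketch of the flow after this change, assuming the INC 3.x `prepare`/`convert` entry points; the values marked "placeholder" stand in for entries the example actually reads from its per-scheme config dict, and the comment on `reloading` reflects my reading of the new flag, not documented behavior.

```python
# Sketch only: consolidates the deepseek/quantize.py hunks above.
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.quantization import AutoRoundConfig, prepare, convert

MODEL = "unsloth/DeepSeek-R1-BF16"  # from the README hunk above

# dtype="auto" keeps the checkpoint's stored dtype instead of upcasting;
# trust_remote_code=False assumes the architecture is available natively
# in transformers, so no remote modeling code is needed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL, device_map="cpu", trust_remote_code=False, dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=False)

quant_config = AutoRoundConfig(
    tokenizer=tokenizer,
    scheme="MXFP8",              # placeholder
    iters=0,                     # placeholder
    fp_layers="lm_head",         # placeholder
    export_format="auto_round",  # placeholder
    output_dir="./qmodel",       # placeholder
    reloading=False,             # new in this PR; assumption: skips re-loading the exported checkpoint
)

# quantizer execute (INC 3.x style)
model = prepare(model, quant_config)
model = convert(model, quant_config)
```
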
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md
index 14a04aa99e0..8f494b05a55 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md
@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
index 28b7e59b75d..24b6a762ff2 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
@@ -62,11 +62,14 @@ def quant_model(args):
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        enable_torch_compile=args.enable_torch_compile,
+        enable_torch_compile=True,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
+        disable_opt_rtn=True,
+        low_gpu_mem_usage=True,
         output_dir=output_dir,
+        reloading=False,
     )
 
     # quantizer execute
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
index 1d805c7872b..d0039e5ecff 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
     --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt
\ No newline at end of file
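
Reviewer note: dropping `--limit 64` from both run_evaluation.sh scripts means lm_eval now scores every example in each task rather than only the first 64, so the reported accuracy becomes comparable to full-run baselines (at the cost of longer wall time). Before kicking off a full evaluation, a quick smoke test of the exported checkpoint can catch loading problems early; a minimal sketch, assuming the patched vLLM fork from the READMEs is installed and using a placeholder model path:

```python
# Sketch only: sanity-check that the quantized export loads and generates.
from vllm import LLM, SamplingParams

llm = LLM(model="./qmodel")  # placeholder: the directory quantize.py wrote via output_dir
params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)
```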