@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
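Both example READMEs pin these exact versions, so a quick post-install check helps catch a stale environment. A minimal sketch, using only standard pip/Python version introspection:

```bash
# Confirm the pins resolved to the expected versions
pip show neural-compressor-pt auto-round | grep -E '^(Name|Version):'
# The vLLM fork installs editable; print its version and where it resolves from
python -c "import vllm; print(vllm.__version__, vllm.__file__)"
```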
@@ -16,7 +16,7 @@ pip uninstall flash_attn
 ### Quantize Model
 - Export model path
 ```bash
-export MODEL=deepseek-ai/DeepSeek-R1
+export MODEL=unsloth/DeepSeek-R1-BF16
 ```

 - MXFP8
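The source checkpoint switches from the upstream deepseek-ai/DeepSeek-R1 release, which ships FP8 weights, to unsloth's BF16 conversion, presumably so the CPU loading path below sees a plain BF16 source. Staging the download up front avoids a long stall on first load; a sketch, with an illustrative local directory name:

```bash
# Pre-fetch the BF16 checkpoint (local dir name is hypothetical)
huggingface-cli download $MODEL --local-dir ./DeepSeek-R1-BF16
export MODEL=./DeepSeek-R1-BF16
```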
@@ -40,11 +40,12 @@ def get_model_and_tokenizer(model_name):
     fp32_model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="cpu",
-        trust_remote_code=True,
+        trust_remote_code=False,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        trust_remote_code=True,
+        trust_remote_code=False,
     )
     return fp32_model, tokenizer

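Read as a whole, the hunk leaves the loader as below: trust_remote_code can drop to False presumably because the architecture is now supported natively in transformers, and dtype="auto" keeps the checkpoint's stored dtype instead of upcasting to FP32 on CPU. A sketch reconstructed from the diff, imports assumed from the top of the script:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

def get_model_and_tokenizer(model_name):
    # Load on CPU in the checkpoint's stored dtype (BF16 for the
    # unsloth conversion) without executing repo-supplied code.
    fp32_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",
        trust_remote_code=False,
        dtype="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=False,
    )
    return fp32_model, tokenizer
```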
@@ -68,6 +69,7 @@ def quant_model(args):
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        reloading=False,
     )

     # quantizer execute
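The `# quantizer execute` marker suggests the script then hands the config to Intel Neural Compressor's 3.x two-step API; a sketch of that flow under that assumption (the actual call site is outside this hunk):

```python
from neural_compressor.torch.quantization import prepare, convert

# INC 3.x flow: attach the AutoRound config, calibrate, then convert
model = prepare(fp32_model, quant_config)
# ... calibration pass over a small dataset happens here ...
model = convert(model)
```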
@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
     --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt
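Dropping --limit 64 moves the evaluation from a 64-sample spot check to the full task, which is what reported accuracy should come from. For smoke tests, lm-eval's standard --limit flag can still be added back by hand; a sketch, where ${QMODEL} stands in for whatever model arguments the full command above uses:

```bash
# Quick sanity pass: cap each task at 64 samples, as the old script did
lm_eval --model vllm \
    --model_args pretrained=${QMODEL} \
    --tasks $TASK_NAME \
    --batch_size $BATCH_SIZE \
    --limit 64 \
    --seed 42 \
    --output_path ${OUTPUT_DIR}
```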
@@ -4,7 +4,7 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX
 ```bash
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.2
+pip install auto-round==0.9.3
 # vLLM
 git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
 VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
@@ -62,11 +62,14 @@ def quant_model(args):
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        enable_torch_compile=args.enable_torch_compile,
+        enable_torch_compile=True,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
+        disable_opt_rtn=True,
+        low_gpu_mem_usage=True,
         output_dir=output_dir,
+        reloading=False,
     )

     # quantizer execute
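The Qwen path now hard-codes its performance options rather than threading them through argparse. Reconstructed from the diff, with flag intent noted as the names suggest (reloading=False is taken from the diff itself; its exact semantics aren't documented here):

```python
quant_config = AutoRoundConfig(
    tokenizer=tokenizer,
    scheme=config["scheme"],        # e.g. MXFP4 / MXFP8
    enable_torch_compile=True,      # always on; no longer a CLI switch
    iters=config["iters"],
    fp_layers=config["fp_layers"],  # layers kept in higher precision
    export_format=export_format,
    disable_opt_rtn=True,           # skip auto-round's optimized-RTN mode
    low_gpu_mem_usage=True,         # reduce peak GPU memory during tuning
    output_dir=output_dir,
    reloading=False,
)
```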
@@ -114,7 +114,6 @@ lm_eval --model vllm \
     --tasks $TASK_NAME \
     --batch_size $BATCH_SIZE \
     --log_samples \
-    --limit 64 \
     --seed 42 \
     --output_path ${OUTPUT_DIR} \
     --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt