
Commit 7ba9fdc

support gptq true_sequential and quant_lm_head (#1977)

Signed-off-by: Kaihui-intel <[email protected]>

1 parent 68b1f8b · commit 7ba9fdc

8 files changed, +430 -99 lines changed

.azure-pipelines/scripts/ut/run_itrex.sh (+2 -1)

@@ -18,7 +18,8 @@ bash /intel-extension-for-transformers/.github/workflows/script/install_binary.s
 sed -i '/neural-compressor.git/d' /intel-extension-for-transformers/tests/requirements.txt
 pip install -r /intel-extension-for-transformers/tests/requirements.txt
 # workaround
-pip install onnx==1.15.0
+pip install onnx==1.16.0
+pip install onnxruntime==1.18.0
 echo "pip list itrex ut deps..."
 pip list
 LOG_DIR=/neural-compressor/log_dir

docs/source/3x/PT_WeightOnlyQuant.md (+3 -2)

@@ -111,9 +111,10 @@ model = convert(model)
 | model_path (str) | Model path that is used to load state_dict per layer | |
 | use_double_quant (bool) | Enables double quantization | False |
 | act_order (bool) | Whether to sort Hessian's diagonal values to rearrange channel-wise quantization order | False |
-| percdamp (float) | Percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability | 0.01. |
+| percdamp (float) | Percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability | 0.01 |
 | block_size (int) | Execute GPTQ quantization per block, block shape = [C_out, block_size] | 128 |
-| static_groups (bool) | Whether to calculate group wise quantization parameters in advance. This option mitigate actorder's extra computational requirements. | False. |
+| static_groups (bool) | Whether to calculate group wise quantization parameters in advance. This option mitigate actorder's extra computational requirements. | False |
+| true_sequential (bool) | Whether to quantize layers within a transformer block in their original order. This can lead to higher accuracy but slower overall quantization process. | False |
 > **Note:** `model_path` is only used when use_layer_wise=True. `layer-wise` is stay-tuned.

 ``` python
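For context, a minimal usage sketch of the two options this commit adds, assuming the 3.x `prepare`/`convert` flow shown in this document; `model` and the calibration function `run_fn` are user-supplied placeholders, and the import path is assumed from the 3.x PyTorch API.

```python
# Minimal sketch, not the documented example from this file.
from neural_compressor.torch.quantization import GPTQConfig, convert, prepare

quant_config = GPTQConfig(
    true_sequential=True,  # quantize layers inside each transformer block in their original order
    quant_lm_head=True,    # also quantize the lm_head layer (previously rejected by an assert)
)
model = prepare(model, quant_config)  # `model`: a user-provided torch model
run_fn(model)                         # user-provided calibration loop
model = convert(model)
```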

neural_compressor/torch/algorithms/weight_only/gptq.py (+334 -88)

Large diffs are not rendered by default.

neural_compressor/torch/algorithms/weight_only/rtn.py (+2 -0)

@@ -177,6 +177,8 @@ def convert(
             if dtype != "int" and "int" in dtype:
                 bits = int(dtype.lstrip("int"))
                 dtype = "int"
+            else:
+                continue
             log_msg = (
                 f"RTN quantization config: bits={bits}, group_size={group_size}, "
                 + f"scheme={scheme}, quantile={quantile}"

neural_compressor/torch/quantization/algorithm_entry.py (+2 -0)

@@ -159,11 +159,13 @@ def gptq_entry(
         "percdamp": quant_config.percdamp,
         "block_size": quant_config.block_size,
         "static_groups": quant_config.static_groups,
+        "true_sequential": quant_config.true_sequential,
     }
     kwargs.update(
         {
             "use_layer_wise": quant_config.use_layer_wise,
             "model_path": quant_config.model_path,
+            "quant_lm_head": quant_config.quant_lm_head,
         }
     )
     kwargs.pop("example_inputs")

neural_compressor/torch/quantization/config.py (+7 -2)

@@ -351,6 +351,7 @@ class GPTQConfig(TorchBaseConfig):
         "percdamp",
         "block_size",
         "static_groups",
+        "true_sequential",
     ]

     def __init__(

@@ -376,6 +377,7 @@ def __init__(
         percdamp: float = 0.01,
         block_size: int = 2048,
         static_groups: bool = False,
+        true_sequential: bool = False,
         # Tuning space
         white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
     ):

@@ -404,10 +406,12 @@ def __init__(
             static_groups (bool): Whether to calculate group wise quantization parameters in advance.
                 This option mitigate actorder's extra computational requirements.
                 Default is False.
+            true_sequential (bool): Whether to quantize layers within a transformer block in their original order.
+                This can lead to higher accuracy but slower overall quantization process.
+                Default is False.
             white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types.
                 Default is DEFAULT_WHITE_LIST.
         """
-        assert not quant_lm_head, "GPTQ doesn't support lm_head quantization currently, it's coming soon!"
         super().__init__(white_list=white_list)
         self.dtype = dtype
         self.bits = bits

@@ -428,6 +432,7 @@ def __init__(
         self.percdamp = percdamp
         self.block_size = block_size
         self.static_groups = static_groups
+        self.true_sequential = true_sequential
         self.quant_lm_head = quant_lm_head
         self._post_init()  # initialize global & local configuration

@@ -599,7 +604,7 @@ def __init__(
             double_quant_bits (int): Number of bits used to represent double_quant scale, default is 4.
             double_quant_use_sym (bool): Indicates whether double_quant scale are symmetric, default is True.
             double_quant_group_size (int): Size of double_quant groups, default is 32.
-            quant_lm_head (bool): Indicates whether quantize the lm_head layer in transformers。 Default is False.
+            quant_lm_head (bool): Indicates whether quantize the lm_head layer in transformer, default is False.
             use_auto_scale (bool): Enables best scales search based on activation distribution, default is True.
             use_auto_clip (bool): Enables clip range search. Defaults to True.
             folding(bool): Allow insert mul before linear when the scale cannot be absorbed by last layer,
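The code that actually implements true_sequential lives in the gptq.py diff, which is not rendered above. As a rough illustration only (not the code from this commit; the helper names `layer_groups`, `cached_inputs`, and `quantize_layer` are hypothetical), true-sequential GPTQ quantizes a block's linear layers group by group and re-runs the block between groups, so later layers are calibrated against already-quantized predecessors:

```python
import torch


def quantize_block_true_sequential(block, layer_groups, cached_inputs, quantize_layer):
    """Illustrative sketch of true-sequential quantization of one transformer block.

    layer_groups: ordered lists of sub-layer names, e.g.
        [["attn.q_proj", "attn.k_proj", "attn.v_proj"], ["attn.out_proj"], ["mlp.fc_in"], ["mlp.fc_out"]]
    cached_inputs: example input tensors captured for this block during calibration.
    quantize_layer: callable that runs GPTQ on one named sub-layer in place.
    """
    for group in layer_groups:
        for name in group:
            quantize_layer(block, name)  # quantize this sub-layer with current activation statistics
        # Re-propagate the cached inputs so the next group's statistics reflect
        # the partially quantized block rather than the original weights.
        with torch.no_grad():
            for x in cached_inputs:
                block(x)
    return block
```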

test/3x/torch/quantization/weight_only/test_gptq.py (+69 -4)

@@ -182,9 +182,10 @@ def test_act_order(self):
         # compare atol, this case is an ideal case.
         assert atol_false > atol_true, "act_order=True doesn't help accuracy, maybe is reasonable, please double check."

-    def test_layer_wise(self):
+    @pytest.mark.parametrize("quant_lm_head", [False, True])
+    def test_layer_wise(self, quant_lm_head):
         model = copy.deepcopy(self.tiny_gptj)
-        quant_config = GPTQConfig()
+        quant_config = GPTQConfig(quant_lm_head=quant_lm_head)
         model = prepare(model, quant_config)
         run_fn(model)
         model = convert(model)

@@ -194,12 +195,76 @@ def test_layer_wise(self):

         model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM")

-        quant_config = GPTQConfig(use_layer_wise=True, model_path="hf-internal-testing/tiny-random-GPTJForCausalLM")
+        quant_config = GPTQConfig(
+            use_layer_wise=True,
+            quant_lm_head=quant_lm_head,
+            model_path="hf-internal-testing/tiny-random-GPTJForCausalLM",
+        )
+        model = prepare(model, quant_config)
+        run_fn(model)
+        model = convert(model)
+        out = model(self.example_inputs)[0]
+
+        # remove lwq tmp directory
+        from neural_compressor.torch.algorithms.layer_wise.utils import LWQ_WORKSPACE
+
+        shutil.rmtree(LWQ_WORKSPACE, ignore_errors=True)
+        assert torch.equal(
+            out, q_label
+        ), f"use_layer_wise=True and quant_lm_head={quant_lm_head} output should be same. Please double check."
+
+    def test_true_sequential(self):
+        # true_sequential=False
+        model = copy.deepcopy(self.tiny_gptj)
+        quant_config = GPTQConfig(
+            true_sequential=False,
+        )
+        model = prepare(model, quant_config)
+        run_fn(model)
+        model = convert(model)
+        out = model(self.example_inputs)[0]
+        atol_false = (out - self.label).amax()
+        # true_sequential=True
+        model = copy.deepcopy(self.tiny_gptj)
+        quant_config = GPTQConfig(
+            true_sequential=True,
+        )
+        model = prepare(model, quant_config)
+        run_fn(model)
+        model = convert(model)
+        out = model(self.example_inputs)[0]
+        atol_true = (out - self.label).amax()
+        # compare atol, this case is an ideal case.
+        assert (
+            atol_false < atol_true
+        ), "true_sequential=True doesn't help accuracy, maybe is reasonable, please double check."
+
+    def test_quant_lm_head(self):
+        # quant_lm_head=False
+        model = copy.deepcopy(self.tiny_gptj)
+        quant_config = GPTQConfig(
+            quant_lm_head=False,
+        )
         model = prepare(model, quant_config)
         run_fn(model)
         model = convert(model)
         out = model(self.example_inputs)[0]
-        assert torch.equal(out, q_label), "use_layer_wise=True output should be same. Please double check."
+        atol_false = (out - self.label).amax()
+        # quant_lm_head=True
+        model = copy.deepcopy(self.tiny_gptj)
+        quant_config = GPTQConfig(
+            quant_lm_head=True,
+        )
+        model = prepare(model, quant_config)
+        run_fn(model)
+        model = convert(model)
+        out = model(self.example_inputs)[0]
+        atol_true = (out - self.label).amax()
+        # compare atol, this case is an ideal case.
+        assert (
+            atol_false < atol_true
+        ), "quant_lm_head=True doesn't help accuracy, maybe is reasonable, please double check."
+        assert get_woq_linear_num(model, "INCWeightOnlyLinear") == 31, "Incorrect number of INCWeightOnlyLinear modules"

     @pytest.mark.parametrize("dtype", ["nf4", "int4"])
     @pytest.mark.parametrize("double_quant_bits", [6])
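To exercise just the cases touched by this commit locally, something like the following should work; it is a sketch that assumes pytest and the repo's test dependencies are installed, the test path comes from the file header above, and the `-k` expression is simply a guess at matching the new test names.

```python
# Hypothetical local runner for the new GPTQ tests; not part of this commit.
import pytest

pytest.main(
    [
        "test/3x/torch/quantization/weight_only/test_gptq.py",
        "-k", "layer_wise or true_sequential or quant_lm_head",
        "-q",
    ]
)
```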

test/3x/torch/quantization/weight_only/test_rtn.py (+11 -2)

@@ -174,6 +174,15 @@ def test_quant_lm_head(self):
         ), "The tied lm_head weight is not deep copied, please check!"

     def test_layer_wise(self):
+        # use_layer_wise=False
+        model = copy.deepcopy(self.tiny_gptj)
+        quant_config = RTNConfig(
+            use_layer_wise=False,
+        )
+        model = prepare(model, quant_config)
+        model = convert(model)
+        out0 = model(self.example_inputs)[0]
+
         from neural_compressor.torch import load_empty_model

         model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM")

@@ -182,8 +191,8 @@ def test_layer_wise(self):
         )
         model = prepare(model, quant_config)
         model = convert(model)
-        out = model(self.example_inputs)[0]
-        assert torch.equal(out, self.q_label), "use_layer_wise=True output should be same. Please double check."
+        out1 = model(self.example_inputs)[0]
+        assert torch.equal(out1, out0), "use_layer_wise=True output should be same. Please double check."

     @pytest.mark.parametrize(
         "dtype",
