From d4ac0efefcf167cea98dd29cade799526fcfd69f Mon Sep 17 00:00:00 2001
From: ceci3
Date: Tue, 19 Dec 2023 18:42:50 +0800
Subject: [PATCH 1/9] support nf4 channel wise quant & fix bug when blocksize>512 (#1817) (#1818)

---
 csrc/lc/dequantize_blockwise.cu | 101 ++++++++++++++++++++---
 csrc/lc/quantize_blockwise.cu   | 115 ++++++++++++++++++++++++--------
 2 files changed, 179 insertions(+), 37 deletions(-)

diff --git a/csrc/lc/dequantize_blockwise.cu b/csrc/lc/dequantize_blockwise.cu
index 8046c34ac..0bf76a163 100644
--- a/csrc/lc/dequantize_blockwise.cu
+++ b/csrc/lc/dequantize_blockwise.cu
@@ -201,7 +201,6 @@ template __global__ void kDequantizeBlockwise(const floa
 //template __global__ void kDequantizeBlockwise<__nv_bfloat16, 512, 64, 8, NF4>(const float *code, const unsigned char * A, const float * absmax, __nv_bfloat16 *out, int blocksize, int n);
 
 
-
 template void dequantize_blockwise(const float *code, const unsigned char *A, const float *absmax, T *out, int blocksize, int n)
 {
   int num_blocks = n/blocksize;
@@ -226,6 +225,67 @@ template void dequantize_blockwise(const float *code, const unsigned
 //template void dequantize_blockwise<__nv_bfloat16, FP4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n);
 //template void dequantize_blockwise<__nv_bfloat16, NF4>(const float *code, const unsigned char *A, const float *absmax, __nv_bfloat16 *out, int blocksize, int n);
 
+// Dequantizes channel-wise packed 4-bit codes back to float.
+//
+// Byte i of A packs two codes that belong to column (i % cout): the high
+// nibble is written to row 2 * (i / cout) and the low nibble to the row
+// below it, i.e. cout elements further on in the flat output. This mirrors
+// kQuantizeChannelwise, which packs A[idx] and A[idx + cout] into one byte.
+// absmax is read per column. Written as a grid-stride loop over the n / 2
+// packed bytes. NOTE(review): assumes n is a multiple of 2 * cout and that
+// out holds n floats -- confirm against the caller.
+template
+__global__ void kDequantizeChannelwise(const unsigned char* A,
+                                       const float *absmax,
+                                       float *out,
+                                       int n,
+                                       int cout) {
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    int num = n / 2;
+    for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
+        float local_absmax = absmax[i%cout];
+        // Flat offset of the upper of the two output rows packed into byte i.
+        int out_idx = 2*(i/cout)* cout + i%cout;
+        switch(DATA_TYPE)
+        {
+            case FP4:
+                // Same offsets as NF4: the former i*2 + i%cout wrote to the
+                // wrong elements and could index past the n output elements.
+                out[out_idx] = dDequantizeFP4Tree(A[i] >> 4, local_absmax);
+                out[out_idx + cout] = dDequantizeFP4Tree(A[i] & 0x0F, local_absmax);
+                break;
+            case NF4:
+                out[out_idx] = dDequantizeNF4(A[i] >> 4)* local_absmax;
+                out[out_idx + cout] = dDequantizeNF4(A[i] & 0x0F)* local_absmax;
+                break;
+        }
+        // No __syncthreads() here: nothing is shared between threads, and a
+        // barrier inside a loop whose trip count differs per thread is unsafe.
+    }
+}
+
+// Launches kDequantizeChannelwise for n output elements laid out with cout
+// columns, then passes cudaPeekAtLastError() to CUDA_CHECK_RETURN.
+template void dequantize_channelwise(const unsigned char *A, const float *absmax, T *out, int n, int cout)
+{
+  // An empty input would make block_size 0 and divide by zero below.
+  if (n <= 0) return;
+
+  int max_threads = 1024;
+  int64_t block_size =
+      std::min(static_cast(n),
+               static_cast(max_threads/ 4));
+
+  const int64_t max_blocks =
+      std::max(((max_threads - 1) / block_size + 1), static_cast(1));
+  const int64_t grid_size =
+      std::min(max_blocks, (n + block_size - 1) / block_size);
+
+  kDequantizeChannelwise<<>>(A, absmax, out, n, cout);
+  CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}
+
 std::vector DequantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, const paddle::Tensor& absmax, int blocksize, std::string quant_type) {
     int64_t input_numel = input.numel();
     int n = input_numel;
@@ -234,23 +294,44 @@ std::vector DequantizeBlockwise(const paddle::Tensor& input, con
         out_shape = {input_numel * 2, 1};
         n = n * 2;
     }
+    if (blocksize == -1) {
+        out_shape = {input.shape()[0] * 2, input.shape()[1]};
+    }
     auto out = paddle::empty(out_shape, paddle::DataType::FLOAT32, input.place());
 
-    if (quant_type == "8bit")
-        dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n);
-    else if (quant_type == "nf4")
-        dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n);
-    else if (quant_type == "fp4")
-        dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n);
-    else
-        PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. 
"); + if (blocksize == -1) { + if (quant_type == "8bit") + PD_THROW("blocksize is -1 only support NF4 and FP4."); + else + blocksize = n / absmax.numel() * 2; + + int cout = input.shape()[1]; + if (quant_type == "nf4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else if (quant_type == "fp4") + dequantize_channelwise(input.data(), absmax.data(), out.data(), n, cout); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } else { + if (quant_type == "8bit") + dequantize_blockwise(code.data(), input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "nf4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else if (quant_type == "fp4") + dequantize_blockwise(NULL, input.data(), absmax.data(), out.data(), blocksize, n); + else + PD_THROW("NOT supported quant type. Only 8bit, nf4, fp4 are supported. "); + } return {out}; }; std::vector> GetDequantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, const std::vector& abs_max_shape, int blocksize, std::string quant_type){ int64_t first_shape = input_shape[0] * input_shape[1] * 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0] * 2, input_shape[1]}}; else return {input_shape}; } diff --git a/csrc/lc/quantize_blockwise.cu b/csrc/lc/quantize_blockwise.cu index d4f6ff2ca..e8e55b9d8 100644 --- a/csrc/lc/quantize_blockwise.cu +++ b/csrc/lc/quantize_blockwise.cu @@ -279,6 +279,7 @@ __global__ void kQuantizeBlockwise(const float * code, const T * __restrict__ A, #pragma unroll NUM_PER_TH for(int j = 0; j < NUM_PER_TH/2; j++) { + packed_4bit = 0; packed_4bit |= dQuantizeNF4(((float)vals[2*j])*local_abs_max) << 4; packed_4bit |= dQuantizeNF4(((float)vals[2*j+1])*local_abs_max); qvals[j] = packed_4bit; @@ -360,9 +361,39 @@ MAKE_kQuantizeBlockwise(__nv_bfloat16, 256, 2, NF4) 
MAKE_kQuantizeBlockwise(__nv_bfloat16, 128, 2, NF4)
 MAKE_kQuantizeBlockwise(__nv_bfloat16, 64, 2, NF4)
 
+template
+__global__ void kQuantizeChannelwise(const float *code,
+                                     const T* A,
+                                     unsigned char* out,
+                                     float *absmax,
+                                     int n,
+                                     int cout) {
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    int num = n / 2;
+    for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
+        int idx = 2*(i/cout)* cout + i%cout;  // flat offset of the upper of the two rows packed into out[i]
+        float local_absmax = absmax[i %cout];  // one scale per column
+        float inv_local_absmax = 1.0f/local_absmax;  // NOTE(review): not finite when a column's absmax is 0
+        unsigned char packed_4bit = 0;
+        switch(DATA_TYPE)
+        {
+            case FP4:
+                packed_4bit |= dQuantizeFP4(((float)A[idx])*inv_local_absmax) << 4;
+                packed_4bit |= dQuantizeFP4(((float)A[idx+cout])*inv_local_absmax);
+                out[i] = packed_4bit;
+                break;
+            case NF4:
+                packed_4bit |= dQuantizeNF4(((float)A[idx])*inv_local_absmax) << 4;
+                packed_4bit |= dQuantizeNF4(((float)A[idx+cout])*inv_local_absmax);
+                out[i] = packed_4bit;
+                break;
+        }
+    }
+}
 
-
-template void quantize_blockwise(const float *code, const paddle::Tensor& A, float *absmax, unsigned char *out, int blocksize, int n)
+template void quantize_blockwise(const float *code, const paddle::Tensor& A, paddle::Tensor& absmax, unsigned char *out, int blocksize, int n, int channelwise)
 {
   typedef PDTraits traits_;
   typedef typename traits_::DataType DataType_;
@@ -372,22 +403,52 @@ template void quantize_blockwise(const float
   num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
 
   const DataType_* A_data = reinterpret_cast(A.data());
-  if(blocksize == 4096)
-    kQuantizeBlockwise<<>>(code, A_data, absmax, out, n);
-  else if(blocksize == 2048)
-    kQuantizeBlockwise<<>>(code, A_data, absmax, out, n);
-  else if(blocksize == 1024)
-    kQuantizeBlockwise<<>>(code, A_data, absmax, out, n);
-  else if(blocksize == 512)
-    kQuantizeBlockwise<<>>(code, A_data, absmax, out, n);
-  else if(blocksize == 256)
-    kQuantizeBlockwise<<>>(code, A_data, absmax, out, n);
-  else if(blocksize == 128)
-    kQuantizeBlockwise<<>>(code, A_data, absmax, out, n);
-  else if(blocksize == 64)
-    kQuantizeBlockwise<<>>(code, A_data, absmax, out, n);
-  else
-    PD_THROW("only support blocksize is [64, 128, 256, 512, 1024, 2048, 4096].");
+  // Block-wise path: one kernel launch per supported block size.
+  if (channelwise == 0) {
+    if(blocksize == 4096)
+      kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n);
+    else if(blocksize == 2048)
+      kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n);
+    else if(blocksize == 1024)
+      kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n);
+    else if(blocksize == 512)
+      kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n);
+    else if(blocksize == 256)
+      kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n);
+    else if(blocksize == 128)
+      kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n);
+    else if(blocksize == 64)
+      kQuantizeBlockwise<<>>(code, A_data, absmax.data(), out, n);
+    // Any other block size has no matching launch; fail loudly instead of
+    // returning without having launched a kernel.
+    else
+      PD_THROW("only support blocksize is [64, 128, 256, 512, 1024, 2048, 4096].");
+  }
+  else {
+    // Channel-wise path: only the 4-bit types are supported.
+    if (DATA_TYPE == General8bit)
+      PD_THROW("blocksize is -1 only support NF4 and FP4.");
+
+    int cout = A.shape()[1];
+    int max_threads = 1024;
+
+    // One scale per column: maximum of |A| reduced over axis 0.
+    // NOTE(review): the kernel takes absmax as float*; confirm the dtype of
+    // this reduction when A is FLOAT16 or BFLOAT16.
+    absmax = A.abs().max({0});
+
+    int64_t block_size =
+        std::min(static_cast(n),
+                 static_cast(max_threads/ 4));
+
+    const int64_t max_blocks =
+        std::max(((max_threads - 1) / block_size + 1), static_cast(1));
+    const int64_t grid_size =
+        std::min(max_blocks, (n + block_size - 1) / block_size);
+
+    kQuantizeChannelwise<<>>(
+        code, A_data, out, absmax.data(), n, cout);
+  }
 
   CUDA_CHECK_RETURN(cudaPeekAtLastError());
 
@@ 
-395,38 +447,44 @@ template void quantize_blockwise(const float std::vector QuantizeBlockwise(const paddle::Tensor& input, const paddle::Tensor& code, int blocksize, std::string quant_type) { int n = input.numel(); + int channelwise = 0; std::vector out_shape = input.shape(); if (quant_type != "8bit") { // 4bit out_shape = {(n + 1) / 2, 1}; } + if (blocksize == -1){ + blocksize = input.shape()[0]; + out_shape = {input.shape()[0]/2, input.shape()[1]}; + channelwise = 1; + } auto out = paddle::empty(out_shape, paddle::DataType::UINT8, input.place()); int64_t absmax_shape = n / blocksize; auto absmax = paddle::empty({absmax_shape}, paddle::DataType::FLOAT32, input.place()); switch(input.type()) { case paddle::DataType::FLOAT32: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") { - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); } else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::FLOAT16: if (quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; case paddle::DataType::BFLOAT16: if 
(quant_type == "8bit") - quantize_blockwise(code.data(), input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(code.data(), input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "nf4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); else if (quant_type == "fp4") - quantize_blockwise(NULL, input, absmax.data(), out.data(), blocksize, n); + quantize_blockwise(NULL, input, absmax, out.data(), blocksize, n, channelwise); return {out, absmax}; default: @@ -440,7 +498,10 @@ std::vector QuantizeBlockwise(const paddle::Tensor& input, const std::vector> GetQuantizeBlockwiseInferShape(const std::vector& input_shape, const std::vector& code_shape, int blocksize, std::string quant_type){ int64_t first_shape = (input_shape[0] * input_shape[1] + 1) / 2; if (quant_type != "8bit") - return {{first_shape, 1}}; + if (blocksize != -1) + return {{first_shape, 1}}; + else + return {{input_shape[0]/2, input_shape[1]}}; else return {input_shape}; } From dcf79e930694beded6965a21220f7001551f21eb Mon Sep 17 00:00:00 2001 From: Chang Xu Date: Tue, 26 Dec 2023 15:52:31 +0800 Subject: [PATCH 2/9] Add GroupWiseQuant & AWQ & AutoClip (#1821) --- paddleslim/quant/advanced/__init__.py | 8 +- paddleslim/quant/advanced/auto_clip.py | 155 ++++++++++++++++++ paddleslim/quant/advanced/awq_search.py | 78 +++++++++ paddleslim/quant/advanced/piecewise_search.py | 39 +++-- paddleslim/quant/advanced/smooth.py | 56 ++++--- paddleslim/quant/advanced/utils.py | 20 ++- paddleslim/quant/observers/__init__.py | 2 + paddleslim/quant/observers/groupwise.py | 112 +++++++++++++ 8 files changed, 428 insertions(+), 42 deletions(-) create mode 100644 paddleslim/quant/advanced/auto_clip.py create mode 100644 paddleslim/quant/advanced/awq_search.py create mode 100644 paddleslim/quant/observers/groupwise.py diff --git a/paddleslim/quant/advanced/__init__.py 
b/paddleslim/quant/advanced/__init__.py index 1f0744ecf..2e779a6e1 100644 --- a/paddleslim/quant/advanced/__init__.py +++ b/paddleslim/quant/advanced/__init__.py @@ -19,6 +19,8 @@ from . import sample from . import layerwise_quant_error from . import utils_layers +from . import awq_search +from . import auto_clip from .gptq import * from .smooth import * @@ -27,6 +29,8 @@ from .sample import * from .layerwise_quant_error import * from .utils_layers import * +from .awq_search import * +from .auto_clip import * __all__ = [] __all__ += gptq.__all__ @@ -35,4 +39,6 @@ __all__ += piecewise_search.__all__ __all__ += sample.__all__ __all__ += layerwise_quant_error.__all__ -__all__ += utils_layers.__all__ \ No newline at end of file +__all__ += utils_layers.__all__ +__all__ += awq_search.__all__ +__all__ += auto_clip.__all__ \ No newline at end of file diff --git a/paddleslim/quant/advanced/auto_clip.py b/paddleslim/quant/advanced/auto_clip.py new file mode 100644 index 000000000..696901110 --- /dev/null +++ b/paddleslim/quant/advanced/auto_clip.py @@ -0,0 +1,155 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import numpy as np +from .utils import fake_quant +from .metrics import mse_loss +from paddle.distributed.fleet.meta_parallel import ( + ColumnParallelLinear, + RowParallelLinear, +) +__all__ = ['AutoClip'] + +class AutoClip(nn.Layer): + """ + AutoClip from AWQ[https://arxiv.org/abs/2306.00978] + """ + def __init__( + self, + model, + weight_bits=8, + weight_quant_method='groupwise', + loss_function=mse_loss, + sample_function=None, + n_grid=20, + max_shrink=0.5, + n_sample_token=128, + group_size=-1, + ): + super(AutoClip, self).__init__() + self.model = model + self.weight_bits = weight_bits + self.weight_method = weight_quant_method + self.loss_function = loss_function + self.n_grid = n_grid + self.max_shrink = max_shrink + self.n_sample_token = n_sample_token + self.bnt = (1 << (self.weight_bits - 1)) - 1 + self.sampled_inputs = {} + self.sample_function = sample_function + self.group_size = group_size + + self._apply_hook() + + def _apply_hook(self): + self._forward_hook_list = [] + for _, sub_layer in self.model.named_sublayers(): + if type(sub_layer) in [ColumnParallelLinear, RowParallelLinear, paddle.nn.Linear]: + forward_pre_hook_handle = sub_layer.register_forward_pre_hook( + self._forward_pre_hook) + self._forward_hook_list.append(forward_pre_hook_handle) + + def _forward_pre_hook(self, layer, input): + self._sample_scale(input, layer.full_name()) + return input + + def _sample_scale(self, input, name): + input = input[0] if type(input) == tuple else input + input.stop_gradient = True + if name not in self.sampled_inputs: + self.sampled_inputs[name] = input + else: + if self.sample_function is not None: + self.sampled_inputs[name] = self.sample_function.sample( + input, self.sampled_inputs[name], name) + else: + self.sampled_inputs[name] = input + + + def auto_clip(self, group_size=128, oc_batch_size=1024): + """ + search clip scale for each layer and update the layer's weight + """ + for sub_name, sub_layer in 
self.model.named_sublayers(): + name = sub_layer.full_name() + if name not in self.sampled_inputs: + continue + print('AutoClipping', sub_name, name) + weight = sub_layer.weight.cast('float16') + weight_t = paddle.transpose(weight, perm=[1, 0]) + x = self.sampled_inputs[name].cast('float16') + x = x.reshape([-1, x.shape[-1]]) + x = x.reshape([1, x.shape[0], -1, group_size]) + x = x[:, 0::x.shape[1] // self.n_sample_token] + weight_t = weight_t.reshape([weight_t.shape[0], 1, -1, group_size]) + # fast test + # oc_batch_size = weight_t.shape[0] // 4 + oc_batch_size = oc_batch_size if weight_t.shape[0] % oc_batch_size == 0 else 128 # prevent OOM + assert weight_t.shape[0] % oc_batch_size == 0 + + w_all = weight_t + best_max_val_all = [] + + for i_b in range(weight_t.shape[0] // oc_batch_size): + w = w_all[i_b * oc_batch_size: (i_b + 1) * oc_batch_size] + + org_max_val = w.abs().max(axis=-1, keepdim=True) # co, 1, n_group, 1 + best_max_val = org_max_val.clone() + min_errs = paddle.ones_like(org_max_val, dtype='float16') * 1e9 + org_out = (x * w).sum(axis=-1) # co, n_token, n_group + for i_s in range(int(self.max_shrink * self.n_grid)): + max_val = org_max_val * (1 - i_s / self.n_grid) + max_val_tmp = max_val + cur_w = paddle.where(w > max_val_tmp, max_val_tmp, w) + cur_w = paddle.where(cur_w < - max_val_tmp, - max_val_tmp, cur_w) + quant_dequant_weight = fake_quant(cur_w, method='abs_max', weight_bits=4) + cur_out = (x * quant_dequant_weight).sum(axis=-1) + # co, 1, n_group, 1 + tmp = (cur_out - org_out).detach().clone() + err = paddle.pow(tmp, 2).mean(axis=1).reshape(min_errs.shape) + print('block {} search s {} err {}'.format(i_b, i_s, err.mean().item())) + del cur_w, cur_out, quant_dequant_weight, tmp + paddle.device.cuda.empty_cache() + + cur_best_idx = paddle.where(err < min_errs) + if cur_best_idx[0].shape[0] != 0: + min_errs[cur_best_idx] = err[cur_best_idx] + best_max_val[cur_best_idx] = max_val[cur_best_idx] + best_max_val_all.append(best_max_val) + + del 
org_out, org_max_val, min_errs, best_max_val, err, cur_best_idx, max_val_tmp, max_val, w + paddle.device.cuda.empty_cache() + + best_max_val = paddle.concat(best_max_val_all, axis=0) + best_max_val = paddle.squeeze(best_max_val, axis=1) + for param in sub_layer.parameters(include_sublayers=False): + if 'w_0' in param.name: + param_tmp = param.transpose(perm=[1, 0]).cast('float16') + tmp_shape = param_tmp.shape + param_tmp = param_tmp.reshape([best_max_val.shape[0], best_max_val.shape[1], -1]) + best_max_val = paddle.tile(best_max_val, repeat_times=(1, 1, param_tmp.shape[-1])) + param_tmp = paddle.where(param_tmp > best_max_val, best_max_val, param_tmp) + param_tmp = paddle.where(param_tmp < - best_max_val, - best_max_val, param_tmp) + param_tmp = param_tmp.reshape(tmp_shape).cast(param.dtype) + param_tmp = param_tmp.transpose(perm=[1, 0]) + paddle.assign(param_tmp, output=param) + del param_tmp + paddle.device.cuda.empty_cache() + break + + del best_max_val, weight_t, x, weight, self.sampled_inputs[name], w_all, best_max_val_all + paddle.device.cuda.empty_cache() + diff --git a/paddleslim/quant/advanced/awq_search.py b/paddleslim/quant/advanced/awq_search.py new file mode 100644 index 000000000..55151c4e8 --- /dev/null +++ b/paddleslim/quant/advanced/awq_search.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import paddle +import numpy as np +from .utils import compute_scales +from .metrics import mse_loss +__all__ = ['AWQSearch'] + +class AWQSearch(): + def __init__(self, + n_grid=20, + bits_length=4, + weight_quant_method='groupwise', + group_size=128, + loss_function=mse_loss): + ''' + The implementation of AutoScale from AWQ(https://arxiv.org/pdf/2306.00978.pdf). + ''' + self.n_grid = n_grid + self.bits_length = bits_length + self.weight_quant_method = weight_quant_method + self.bnt = (1 << (bits_length - 1)) - 1 + self.group_size = group_size + self.loss_function = loss_function + + def search(self, layer_name, sampled_input, act_abs_max, weight): + act = sampled_input + act.stop_gradient = True + print('[awq search] search input of %s' % layer_name) + dtype = weight.dtype + origin_out = paddle.matmul(act, weight) + best_error = float('inf') + best_ratio = -1 + best_scales = None + + for ratio in range(self.n_grid): + ratio = ratio * 1 / self.n_grid + act_abs_max_tmp = act_abs_max.detach().clone().cast('float32') + scales = paddle.clip(paddle.pow(act_abs_max_tmp, ratio), min=1e-4) + scales = scales / (scales.max() * scales.min()).sqrt() + scales = scales.cast(dtype) + new_weight = weight * scales.reshape([-1, 1]) + new_act = act / scales + quant_scale = compute_scales( + new_weight, method=self.weight_quant_method, group_size=self.group_size) + if self.weight_quant_method == 'groupwise': + quant_scale = paddle.repeat_interleave(quant_scale.cast('float32'), self.group_size, 0).cast(dtype) + quant_weight = paddle.clip( + paddle.round(new_weight / quant_scale * self.bnt), + -self.bnt - 1, self.bnt) + quant_dequant_weight = quant_weight / self.bnt * quant_scale + new_out = paddle.matmul(new_act, + quant_dequant_weight) + loss = self.loss_function(origin_out, new_out).numpy() + is_best = loss < best_error + if is_best: + print('find better ratio: {}, loss: {}'.format(ratio, loss)) + best_error = loss + best_ratio = ratio + best_scales = scales + + if best_scales is 
None: + best_scales = paddle.ones(scales.shape, dtype=dtype) + print('Cannot find better ratio.') + else: + print('Best ratio :{}, minimal loss : {}.'.format(best_ratio, best_error)) + return best_scales diff --git a/paddleslim/quant/advanced/piecewise_search.py b/paddleslim/quant/advanced/piecewise_search.py index 55678409b..e326f2e55 100644 --- a/paddleslim/quant/advanced/piecewise_search.py +++ b/paddleslim/quant/advanced/piecewise_search.py @@ -31,6 +31,8 @@ def __init__(self, search_scale_max=5., weight_quant_method='abs_max_channel_wise', act_quant_method='abs_max', + use_clip=False, + search_clip=False, loss_function=mse_loss): ''' PieceWiseSearch provides to search k_piece, alpha and scale. @@ -58,31 +60,36 @@ def __init__(self, self.act_quant_method = act_quant_method self.bnt = (1 << (bits_length - 1)) - 1 self.loss_function = loss_function + self.use_clip = use_clip + self.search_clip = search_clip def search(self, layer_name, sampled_input, act_abs_max, weight): act = sampled_input act.stop_gradient = True print('[smooth search] search input of %s' % layer_name) - + dtype = weight.dtype origin_out = paddle.matmul(act, weight) w_abs_max = weight.abs().max(axis=-1, keepdim=True) rw_abs_max = w_abs_max.reshape(act_abs_max.shape) - np_act_abs_max = np.array(act_abs_max) - np_rw_abs_max = np.array(rw_abs_max) - + smooth_scale_out = None global_loss = float('inf') best_scale = None - for k_piece in range(1, self.k_piece + 1): + if self.search_clip: + piece_range = [1] + list(range(1, self.k_piece + 1)) + else: + piece_range = list(range(1, self.k_piece + 1)) + + for k_idx, k_piece in enumerate(piece_range): if not self.search_piece: k_piece = self.k_piece print('Search {} Piece'.format(k_piece)) centroids, labels = k_means(act_abs_max, k_piece) piece = ['piece_{}'.format(a) for a in range(len(centroids))] for i in range(len(centroids)): - # print('search for piece {}; centroids value is {}'.format( - # piece[i], centroids[centroids.argsort()[i]].numpy())) + 
print('search for piece {}; centroids value is {}'.format( + piece[i], float(centroids[centroids.argsort()[i: i + 1]].cast('float32')))) alpha = self.search_alpha_min alpha_max = self.search_scale_max if self.search_scale_max is not None else self.search_alpha_max calibration_loss = float('inf') @@ -104,12 +111,16 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): alpha = round(alpha, 2) if alpha < 1: - s = (np.power(np_act_abs_max, alpha) / np.power( - np_rw_abs_max, 1. - alpha)).clip(min=1e-5) - s = paddle.to_tensor(s, dtype='float32') + act_abs_max_tmp = act_abs_max.detach().clone() + s = paddle.clip(paddle.pow(act_abs_max_tmp, alpha) / paddle.pow( + rw_abs_max, 1 - alpha), min=1e-5) + + if self.use_clip or (k_piece == 1 and k_idx == 1 and self.search_clip): + s = paddle.clip(act_abs_max_tmp / paddle.max(act_abs_max / s), min=1) + del act_abs_max_tmp smooth_scale = s * mask_for_search else: - smooth_scale = alpha * mask_for_search + smooth_scale = paddle.to_tensor(alpha, dtype=dtype) * mask_for_search if smooth_scale_out is not None: mask_for_ones_new = paddle.where( @@ -145,9 +156,10 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): calibration_loss = cur_loss final_smooth_scale = smooth_scale final_alpha = alpha + # print('Better alpha: {} loss: {}'.format(alpha, calibration_loss.cast('float32'))) - # print("Layer {} Piece {}, loss: {}, alpha : {}".format( - # layer_name, piece[i], float(calibration_loss), final_alpha)) + print("Layer {} Piece {}, loss: {}, alpha : {}".format( + layer_name, piece[i], float(calibration_loss.cast('float32')), final_alpha)) if smooth_scale_out is None: smooth_scale_out = final_smooth_scale else: @@ -160,4 +172,5 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): print('Find Better K-Piece {}'.format(k_piece)) if not self.search_piece: break + return best_scale diff --git a/paddleslim/quant/advanced/smooth.py b/paddleslim/quant/advanced/smooth.py index e715788ed..5e32435f5 
100644 --- a/paddleslim/quant/advanced/smooth.py +++ b/paddleslim/quant/advanced/smooth.py @@ -26,6 +26,8 @@ def __init__( model_config, alpha=0.5, smooth_all_linears=False, + start_sample_step=10000, + smooth_method='smoothquant', sample_function=None, search_function=None, ): ''' @@ -68,6 +70,8 @@ def __init__( self.smooth_all_linears = smooth_all_linears self.sample_function = sample_function self.search_function = search_function + self.start_sample_step = start_sample_step + self.smooth_method = smooth_method self.model.eval() self.step = 0 @@ -98,7 +102,6 @@ def _get_smooth_layers(self): self.ln_linear_dict, self.linear_ln_dict = get_ln_linear_info( self.layer_order, self.norm_flag, self.linear_flag, self.fused_qkv, self.parallel_ffn, self.skip_norm_list) - assert len(self.ln_linear_dict) > 0, 'No LN/Linear pair found' for key in self.ln_linear_dict: print('smooth pair LN {} : Linear {}'.format( @@ -147,29 +150,32 @@ def _forward_pre_hook(self, layer, input): def _sample_scale(self, input, ln_name): x = input[0] if type(input) == tuple else input x.stop_gradient = True - x_abs_max = x.abs().max(axis=1, keepdim=True) - x_abs_max = x_abs_max.max(axis=0) + + if self.smooth_method == 'smoothquant': + x_abs_max = x.abs().max(axis=1, keepdim=True) + x_abs_max = x_abs_max.max(axis=0) + elif self.smooth_method == 'awq': + x_abs_max = x.abs().reshape([-1, x.shape[-1]]) + x_abs_max = x_abs_max.mean(axis=0).reshape([1, -1]) + else: + raise NotImplementedError("To be implemented") if ln_name not in self.scale_dict: self.sampled_inputs[ln_name] = x self.scale_dict[ln_name] = x_abs_max else: - if self.sample_function is not None: + if self.sample_function is not None and self.step >= self.start_sample_step: self.sampled_inputs[ln_name] = self.sample_function.sample( x, self.sampled_inputs[ln_name], ln_name) else: self.sampled_inputs[ln_name] = x - tmp1 = paddle.concat([x_abs_max, self.scale_dict[ln_name]], axis=0) - self.scale_dict[ln_name] = tmp1.max(axis=0, keepdim=True) 
+ if self.smooth_method == 'smoothquant': + tmp1 = paddle.concat([x_abs_max, self.scale_dict[ln_name]], axis=0) + self.scale_dict[ln_name] = tmp1.max(axis=0, keepdim=True) + elif self.smooth_method == 'awq': + tmp1 = paddle.concat([x_abs_max, self.scale_dict[ln_name]], axis=0) + self.scale_dict[ln_name] = tmp1.mean(axis=0, keepdim=True) - # per step print once - if self.print_step == self.step: - print('[Smooth] Step [{}]: {}. abs_min: {}, abs_max: {}'.format( - self.step, ln_name, - float(self.scale_dict[ln_name].cast("float32").min()), - float(self.scale_dict[ln_name].cast("float32").max()))) - if ln_name == list(self.linear_ln_dict.values())[-1]: - self.print_step += 1 def update_weight(self): @@ -181,24 +187,20 @@ def update_weight(self): if type(sub_layer) == ShiftSmoothHelpLayer: ln_name = layer_name if ln_name is not None: - act_abs_max = self.scale_dict[ln_name].cast("float32") - sampled_input = self.sampled_inputs[ln_name].cast("float32") + act_abs_max = self.scale_dict[ln_name].cast("float16") + sampled_input = self.sampled_inputs[ln_name].cast("float16") for param in sub_layer.parameters(include_sublayers=False): if 'w_0' in param.name: - weight = param.cast("float32") + # weight = param.cast("float32") if self.search_function is not None: s = self.search_function.search( - layer_name, sampled_input, act_abs_max, weight) + layer_name, sampled_input, act_abs_max, param.cast("float16")) else: - w_abs_max = weight.abs().max(axis=-1, keepdim=True) + w_abs_max = param.abs().max(axis=-1, keepdim=True) rw_abs_max = w_abs_max.reshape(act_abs_max.shape) - act_abs_max_np = act_abs_max.numpy() - weight_abs_max_np = rw_abs_max.numpy() - s = ( - np.power(act_abs_max_np, self.alpha) / np.power( - weight_abs_max_np, 1 - self.alpha)).clip( - min=1e-5) - s = paddle.to_tensor(s, dtype="float32") + act_abs_max_tmp = act_abs_max.detach().clone() + s = paddle.clip(paddle.pow(act_abs_max_tmp, self.alpha) / paddle.pow( + rw_abs_max, 1 - self.alpha), min=1e-5) 
self.smooth_scale_dict[ln_name] = s.cast(param.dtype) break @@ -273,4 +275,4 @@ def update_weight(self): def _remove_hook(self): for hook in self._forward_hook_list: hook.remove() - self._forward_hook_list = [] + self._forward_hook_list = [] \ No newline at end of file diff --git a/paddleslim/quant/advanced/utils.py b/paddleslim/quant/advanced/utils.py index 703fc5e1c..ff77462b2 100644 --- a/paddleslim/quant/advanced/utils.py +++ b/paddleslim/quant/advanced/utils.py @@ -38,7 +38,7 @@ def k_means(weight, n_clusters, init='k-means++', max_iter=300): return paddle.to_tensor(centroids.flatten()), paddle.to_tensor(labels) -def compute_scales(x, method='abs_max'): +def compute_scales(x, method='abs_max', group_size=-1): if method == 'abs_max': quant_scale = float(paddle.max(paddle.abs(x.flatten()))) quant_scale = 1e-8 if quant_scale == 0.0 else quant_scale @@ -52,8 +52,26 @@ def compute_scales(x, method='abs_max'): 0, dtype=x.dtype), paddle.to_tensor(1e-8, dtype=x.dtype), quant_scale) + elif method == 'groupwise': + input_shape = x.shape + input_processed = x.transpose([1, 0]).reshape( + [input_shape[1], input_shape[0] // group_size, group_size]) + quant_scale = paddle.max( + paddle.abs(input_processed), axis=2) + quant_scale = paddle.where(quant_scale == paddle.to_tensor(0, dtype=x.dtype), + paddle.to_tensor(1e-8, dtype=x.dtype), quant_scale) + quant_scale = quant_scale.transpose([1, 0]) + return quant_scale +def fake_quant(x, method='abs_max', weight_bits=8, group_size=-1): + bnt = (1 << (weight_bits - 1)) - 1 + quant_scale = compute_scales(x, method=method, group_size=group_size) + quant_value = paddle.clip( + paddle.round(x / quant_scale * bnt), -bnt - 1, bnt) + quant_dequant_value = quant_value / bnt * quant_scale + return quant_dequant_value + def find_parent_layer_and_sub_name(model, name): last_idx = 0 diff --git a/paddleslim/quant/observers/__init__.py b/paddleslim/quant/observers/__init__.py index 7ab3b723e..0b7970ba8 100644 --- 
a/paddleslim/quant/observers/__init__.py +++ b/paddleslim/quant/observers/__init__.py @@ -20,6 +20,7 @@ from .abs_max import AbsmaxObserver from .mse_weight import MSEChannelWiseWeightObserver from .abs_max_weight import AbsMaxChannelWiseWeightObserver +from .groupwise import GroupWiseWeightObserver __all__ = [ "HistObserver", @@ -31,4 +32,5 @@ "AbsmaxObserver", "MSEChannelWiseWeightObserver", "AbsMaxChannelWiseWeightObserver", + "GroupWiseWeightObserver" ] diff --git a/paddleslim/quant/observers/groupwise.py b/paddleslim/quant/observers/groupwise.py new file mode 100644 index 000000000..1db2067c6 --- /dev/null +++ b/paddleslim/quant/observers/groupwise.py @@ -0,0 +1,112 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from .channel_wise import ChannelWiseObserver +from paddle.quantization.factory import ObserverFactory + + +class GroupWiseWeightObserver(ObserverFactory): + r""" + It collects channel-wise maximum absolute values of target weights. + Args: + bit_length(int, optional): Number of bits to represent an quantized integer in binary. + dtype(str, optional): The data type of input tensor. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + Examples: + .. 
code-block:: python + from paddle.quantization import QuantConfig + from paddle.quantization.quanters import AbsMaxChannelWiseWeightObserver + quanter = AbsMaxChannelWiseWeightObserver() + q_config = QuantConfig(activation=None, weight=quanter) + """ + + def __init__(self, quant_bits=8, group_size=128): + super(GroupWiseWeightObserver, self).__init__( + quant_bits=quant_bits, + group_size=group_size) + + def _get_class(self): + return GroupWiseWeightObserverLayer + + +class GroupWiseWeightObserverLayer(ChannelWiseObserver): + def __init__(self, layer, quant_bits=8, group_size=128): + super(GroupWiseWeightObserverLayer, self).__init__( + layer, + quant_bits=quant_bits, + sign=True, + symmetric=True, ) + self.quant_bits = quant_bits + self.group_size = group_size + self.qmin, self.qmax = self.qmin_qmax + self._layer = layer + self._max = None + self._scale = None + self._zero_point = None + + def forward(self, inputs): + self._max = self._cal_abs_max(inputs) + return inputs + + def _cal_abs_max(self, inputs): + """ Use group_size to group the input, then use the + absmax method to calculate the scale + """ + input_shape = inputs.shape + assert self.group_size == 64 or self.group_size == 128, \ + "group_size only support 64 or 128" + assert inputs.shape[0] % self.group_size == 0, \ + "group_size must be a factor of input channels" + assert len(inputs.shape) == 2, \ + "Currently only support 2D tensor" + input_processed = inputs.transpose([1, 0]).reshape( + [input_shape[1], input_shape[0] // self.group_size, self.group_size]) + + abs_max_values = paddle.max( + paddle.abs(input_processed), axis=2).cast("float32") + # "abs_max_values < 1e-8" in bfloat16 type? + abs_max_values = paddle.where(abs_max_values == np.float32(0), + np.float32(1e-8), abs_max_values) + abs_max_values = abs_max_values.transpose([1, 0]) + return abs_max_values + + def min_value(self) -> float: + return 0. 
+ + def max_value(self) -> float: + return self._max + + def cal_thresholds(self): + """ Compute thresholds for MAX function. + """ + if self._scale is None: + self._scale = self._max + self._zero_point = paddle.zeros_like(self._scale) + + def scales(self): + """ Return output scales. + """ + if self._scale is None: + self.cal_thresholds() + return self._scale + + def zero_points(self): + """ Return output zero points. + """ + if self._zero_point is None: + self.cal_thresholds() + return self._zero_point From 521157e390aa8bca62953e251495257e334a9477 Mon Sep 17 00:00:00 2001 From: Chang Xu Date: Thu, 28 Dec 2023 21:36:33 +0800 Subject: [PATCH 3/9] [Cherry-Pick]Cp fit paddle26 (#1823) --- paddleslim/quant/advanced/gptq.py | 19 +++++++++++++------ paddleslim/quant/advanced/piecewise_search.py | 3 +++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/paddleslim/quant/advanced/gptq.py b/paddleslim/quant/advanced/gptq.py index 96566858f..5ae47205c 100644 --- a/paddleslim/quant/advanced/gptq.py +++ b/paddleslim/quant/advanced/gptq.py @@ -106,8 +106,9 @@ def fasterquant(self, H = self.hessian del self.hessian dead = paddle.where(paddle.diag(H) == 0) - H[dead, dead] = 1 - W[:, dead] = 0 + if dead[0].shape[0] != 0: + H[dead, dead] = 1 + W[:, dead] = 0 del dead if actorder: perm = paddle.argsort(paddle.diag(H), descending=True) @@ -122,9 +123,15 @@ def fasterquant(self, damp = percdamp * paddle.mean(paddle.diag(H)) diag = paddle.arange(self.columns) H[diag, diag] += damp - - H = paddle.inverse(H) - H = paddle.linalg.cholesky(H, upper=True) + try: + H = paddle.inverse(H) + H = paddle.linalg.cholesky(H, upper=True) + except: + print('We skip GPTQ this layer now.') + print( + 'If you want GPTQ this layer, please try setting damp_percent larger or increasing the number of samples.' 
+ ) + return Hinv = H for i1 in range(0, self.columns, blocksize): @@ -182,4 +189,4 @@ def fasterquant(self, self.quantized = True del H, Q, Hinv, W, Losses - paddle.device.cuda.empty_cache() + paddle.device.cuda.empty_cache() \ No newline at end of file diff --git a/paddleslim/quant/advanced/piecewise_search.py b/paddleslim/quant/advanced/piecewise_search.py index e326f2e55..a95b2a1c7 100644 --- a/paddleslim/quant/advanced/piecewise_search.py +++ b/paddleslim/quant/advanced/piecewise_search.py @@ -97,6 +97,8 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): mask_for_search = paddle.where(labels == centroids.argsort()[i], 1., 0.) mask_for_ones = paddle.where(mask_for_search == 0., 1., 0.) + mask_for_search = mask_for_search.cast(dtype) + mask_for_ones = mask_for_ones.cast(dtype) while alpha <= alpha_max: if alpha < 1: @@ -125,6 +127,7 @@ def search(self, layer_name, sampled_input, act_abs_max, weight): if smooth_scale_out is not None: mask_for_ones_new = paddle.where( smooth_scale_out == 0., 1., 0.) 
+ mask_for_ones_new = mask_for_ones_new.cast(dtype) mask_for_ones *= mask_for_ones_new smooth_scale_ = smooth_scale_out + smooth_scale smooth_scale_tmp = smooth_scale_ + mask_for_ones From 0ceafa62df51a6940f30ba991b76bc5f6e92ed94 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 23 Jan 2024 12:23:46 +0000 Subject: [PATCH 4/9] nlp bug --- example/auto_compression/nlp/README.md | 10 ++++---- .../nlp/configs/ernie3.0/tnews.yaml | 23 +++++++++++-------- .../nlp/configs/pp-minilm/auto/afqmc.yaml | 20 ++++++---------- .../nlp/paddle_inference_eval.py | 23 ++++++++++++------- 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/example/auto_compression/nlp/README.md b/example/auto_compression/nlp/README.md index c98f1987e..5448be632 100644 --- a/example/auto_compression/nlp/README.md +++ b/example/auto_compression/nlp/README.md @@ -56,16 +56,16 @@ #### 3.1 准备环境 - python >= 3.6 -- PaddlePaddle >= 2.4 (可从[Paddle官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)下载安装) -- PaddleSlim >= 2.4 -- PaddleNLP >= 2.3 +- PaddlePaddle ==2.5 (可从[Paddle官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)下载安装) +- PaddleSlim ==2.5 +- PaddleNLP ==2.6 安装paddlepaddle: ```shell # CPU -pip install paddlepaddle==2.4.1 +pip install paddlepaddle==2.5.0 # GPU 以Ubuntu、CUDA 11.2为例 -python -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html +python -m pip install paddlepaddle-gpu==2.5.0.post116 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html ``` 安装paddleslim: diff --git a/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml b/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml index 49093ab87..b90da628a 100644 --- a/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml +++ b/example/auto_compression/nlp/configs/ernie3.0/tnews.yaml @@ -6,12 +6,17 @@ Global: dataset: clue 
batch_size: 16 max_seq_length: 128 -TrainConfig: - epochs: 6 - eval_iter: 1110 - learning_rate: 2.0e-5 - optimizer_builder: - optimizer: - type: AdamW - weight_decay: 0.01 - origin_metric: 0.5700 + +# 剪枝 +Prune: + prune_algo: transformer_pruner + pruned_ratio: 0.25 + +# 离线量化 +QuantPost: + activation_bits: 8 + quantize_op_types: + - depthwise_conv2d + - conv2d + weight_bits: 8 + \ No newline at end of file diff --git a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml index 9c9f58826..fdf65673b 100644 --- a/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml +++ b/example/auto_compression/nlp/configs/pp-minilm/auto/afqmc.yaml @@ -6,17 +6,11 @@ Global: dataset: clue batch_size: 16 max_seq_length: 128 -TransformerPrune: - pruned_ratio: 0.25 -HyperParameterOptimization: -Distillation: + +#离线量化 QuantPost: -TrainConfig: - epochs: 6 - eval_iter: 1070 - learning_rate: 2.0e-5 - optimizer_builder: - optimizer: - type: AdamW - weight_decay: 0.01 - origin_metric: 0.7403 + activation_bits: 8 + quantize_op_types: + - conv2d + - depthwise_conv2d + weight_bits: 8 \ No newline at end of file diff --git a/example/auto_compression/nlp/paddle_inference_eval.py b/example/auto_compression/nlp/paddle_inference_eval.py index f48e20698..f61a649c0 100644 --- a/example/auto_compression/nlp/paddle_inference_eval.py +++ b/example/auto_compression/nlp/paddle_inference_eval.py @@ -91,7 +91,8 @@ def parse_args(): "--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " + help= + "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument( "--perf_warmup_steps", @@ -107,7 +108,8 @@ def parse_args(): type=str, default="fp32", choices=["fp32", "fp16", "int8"], - help="The precision of inference. It can be 'fp32', 'fp16' or 'int8'. 
Default is 'fp16'.", + help= + "The precision of inference. It can be 'fp32', 'fp16' or 'int8'. Default is 'fp16'.", ) parser.add_argument( "--use_mkldnn", @@ -156,8 +158,7 @@ def _convert_example(example, } elif "target" in example: # wsc text, query, pronoun, query_idx, pronoun_idx = ( - example["text"], - example["target"]["span1_text"], + example["text"], example["target"]["span1_text"], example["target"]["span2_text"], example["target"]["span1_index"], example["target"]["span2_index"], ) @@ -209,6 +210,12 @@ def create_predictor(cls, args): config = paddle.inference.Config( os.path.join(args.model_path, args.model_filename), os.path.join(args.model_path, args.params_filename)) + config.switch_ir_debug(True) + # 适用于ERNIE 3.0-Medium模型 + # config.exp_disable_tensorrt_ops(["elementwise_add"]) + # config.exp_disable_tensorrt_ops(["fused_embedding_eltwise_layernorm"]) + # config.exp_disable_tensorrt_ops(["tmp_3"]) + if args.device == "gpu": # set GPU configs accordingly config.enable_use_gpu(100, 0) @@ -231,7 +238,7 @@ def create_predictor(cls, args): config.enable_tensorrt_engine( workspace_size=1 << 30, max_batch_size=args.batch_size, - min_subgraph_size=5, + min_subgraph_size=76, precision_mode=precision_map[args.precision], use_static=True, use_calib_mode=False, ) @@ -239,8 +246,8 @@ def create_predictor(cls, args): dynamic_shape_file = os.path.join(args.model_path, "dynamic_shape.txt") if os.path.exists(dynamic_shape_file): - config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file, - True) + config.enable_tuned_tensorrt_dynamic_shape( + dynamic_shape_file, True) print("trt set dynamic shape done!") else: config.collect_shape_range_info(dynamic_shape_file) @@ -365,4 +372,4 @@ def main(): if __name__ == "__main__": paddle.set_device("cpu") - main() + main() \ No newline at end of file From 09fafed18adc5631bde4ac6a545762ede718572c Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 23 Jan 2024 12:40:40 +0000 Subject: [PATCH 5/9] nlp bug --- 
example/auto_compression/nlp/README.md | 38 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/example/auto_compression/nlp/README.md b/example/auto_compression/nlp/README.md index 5448be632..da35eebcc 100644 --- a/example/auto_compression/nlp/README.md +++ b/example/auto_compression/nlp/README.md @@ -95,7 +95,6 @@ pip install paddlenlp |:------:|:------:|:------:|:------:|:------:|:-----------:|:------:|:------:| | PP-MiniLM | [afqmc](https://bj.bcebos.com/v1/paddle-slim-models/act/afqmc.tar) | [tnews](https://bj.bcebos.com/v1/paddle-slim-models/act/tnews.tar) | [iflytek](https://bj.bcebos.com/v1/paddle-slim-models/act/iflytek.tar) | [cmnli](https://bj.bcebos.com/v1/paddle-slim-models/act/cmnli.tar) | [ ocnli](https://bj.bcebos.com/v1/paddle-slim-models/act/ocnli.tar) | [cluewsc2020](https://bj.bcebos.com/v1/paddle-slim-models/act/cluewsc.tar) | [csl](https://bj.bcebos.com/v1/paddle-slim-models/act/csl.tar) | | ERNIE 3.0-Medium | [afqmc](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/AFQMC.tar) | [tnews](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/TNEWS.tar) | [iflytek](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/IFLYTEK.tar) | [cmnli](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CMNLI.tar) | [ocnli](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/OCNLI.tar) | [cluewsc2020](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CLUEWSC2020.tar) | [csl](https://bj.bcebos.com/v1/paddle-slim-models/act/NLP/ernie3.0-medium/fp32_models/CSL.tar) | -| UIE-base | [报销工单](https://bj.bcebos.com/v1/paddle-slim-models/act/uie_base.tar) | 从上表获得模型超链接, 并用以下命令下载推理模型文件: @@ -119,11 +118,6 @@ export CUDA_VISIBLE_DEVICES=0 python run.py --config_path='./configs/pp-minilm/auto/afqmc.yaml' --save_dir='./save_afqmc_pruned/' ``` -自动压缩UIE系列模型需要使用 run_uie.py 
脚本启动,会使用接口```paddleslim.auto_compression.AutoCompression```对模型进行自动压缩。配置config文件中训练部分的参数,将任务名称、模型类型、数据集名称、压缩参数传入,配置完成后便可对模型进行蒸馏量化训练。 -```shell -export CUDA_VISIBLE_DEVICES=0 -python run_uie.py --config_path='./configs/uie/uie_base.yaml' --save_dir='./save_uie_qat/' -``` 如仅需验证模型精度,或验证压缩之后模型精度,在启动```run.py```脚本时,将配置文件中模型文件夹 ```model_dir``` 改为压缩之后保存的文件夹路径 ```./save_afqmc_pruned``` ,命令加上```--eval True```即可: ```shell @@ -217,8 +211,6 @@ QuantPost: - TensorRT预测: -环境配置:如果使用 TesorRT 预测引擎,需安装 ```WITH_TRT=ON``` 的Paddle,下载地址:[Python预测库](https://paddleinference.paddlepaddle.org.cn/master/user_guides/download_lib.html#python) - 首先下载量化好的模型: ```shell wget https://bj.bcebos.com/v1/paddle-slim-models/act/save_ppminilm_afqmc_new_calib.tar @@ -227,10 +219,30 @@ tar -xf save_ppminilm_afqmc_new_calib.tar ```shell python paddle_inference_eval.py \ - --model_path=save_ernie3_afqmc_new_cablib \ + --model_path=save_ppminilm_afqmc_new_calib \ + --model_filename=inference.pdmodel \ + --params_filename=inference.pdiparams \ + --task_name='afqmc' \ + --use_trt \ + --precision=int8 +``` + +- ERNIE 3.0-Medium: +```shell +python paddle_inference_eval.py \ + --model_path=TNEWS \ --model_filename=infer.pdmodel \ --params_filename=infer.pdiparams \ - --task_name='afqmc' \ + --task_name='tnews' \ + --use_trt \ + --precision=fp32 +``` +```shell +python paddle_inference_eval.py \ + --model_path=save_tnews_pruned \ + --model_filename=infer.pdmodel \ + --params_filename=infer.pdiparams \ + --task_name='tnews' \ --use_trt \ --precision=int8 ``` @@ -239,9 +251,9 @@ python paddle_inference_eval.py \ ```shell python paddle_inference_eval.py \ - --model_path=save_ernie3_afqmc_new_cablib \ - --model_filename=infer.pdmodel \ - --params_filename=infer.pdiparams \ + --model_path=save_ppminilm_afqmc_new_calib \ + --model_filename=inference.pdmodel \ + --params_filename=inference.pdiparams \ --task_name='afqmc' \ --device=cpu \ --use_mkldnn=True \ From 39d0402d15e63a54442a3213ce50ad2d054e6fd0 Mon Sep 17 00:00:00 
2001 From: lizexu <2694294196@qq.com> Date: Tue, 23 Jan 2024 12:43:09 +0000 Subject: [PATCH 6/9] nlp bug --- example/auto_compression/nlp/paddle_inference_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/auto_compression/nlp/paddle_inference_eval.py b/example/auto_compression/nlp/paddle_inference_eval.py index f61a649c0..073f032e5 100644 --- a/example/auto_compression/nlp/paddle_inference_eval.py +++ b/example/auto_compression/nlp/paddle_inference_eval.py @@ -238,7 +238,7 @@ def create_predictor(cls, args): config.enable_tensorrt_engine( workspace_size=1 << 30, max_batch_size=args.batch_size, - min_subgraph_size=76, + min_subgraph_size=5, precision_mode=precision_map[args.precision], use_static=True, use_calib_mode=False, ) From 09905eb21fca7f4b1aa8042baa7d07a9805c0be2 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Thu, 1 Feb 2024 07:39:49 +0000 Subject: [PATCH 7/9] =?UTF-8?q?config.switch=5Fir=5Fdebug(True)=E5=85=B3?= =?UTF-8?q?=E6=8E=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- example/auto_compression/nlp/paddle_inference_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/auto_compression/nlp/paddle_inference_eval.py b/example/auto_compression/nlp/paddle_inference_eval.py index 073f032e5..119a5ad8d 100644 --- a/example/auto_compression/nlp/paddle_inference_eval.py +++ b/example/auto_compression/nlp/paddle_inference_eval.py @@ -210,7 +210,7 @@ def create_predictor(cls, args): config = paddle.inference.Config( os.path.join(args.model_path, args.model_filename), os.path.join(args.model_path, args.params_filename)) - config.switch_ir_debug(True) + # config.switch_ir_debug(True) # 适用于ERNIE 3.0-Medium模型 # config.exp_disable_tensorrt_ops(["elementwise_add"]) # config.exp_disable_tensorrt_ops(["fused_embedding_eltwise_layernorm"]) From 137887a10827ffe343f8a30ae1418fc226a86605 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> 
Date: Thu, 1 Feb 2024 08:00:57 +0000 Subject: [PATCH 8/9] =?UTF-8?q?=E6=9B=B4=E6=96=B0readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- example/auto_compression/nlp/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/example/auto_compression/nlp/README.md b/example/auto_compression/nlp/README.md index da35eebcc..3b17ec6f2 100644 --- a/example/auto_compression/nlp/README.md +++ b/example/auto_compression/nlp/README.md @@ -206,8 +206,27 @@ QuantPost: ## 5. 预测部署 + 量化模型在GPU上可以使用TensorRT进行加速,在CPU上可以使用MKLDNN进行加速。 +以下字段用于配置预测参数: + +| 参数名 | 含义 | +|:------:|:------:| +| model_path | inference 模型文件所在目录,该目录下需要有文件 model.pdmodel 和 model.pdiparams 两个文件 | +| model_filename | 模型文件的名称,默认值为inference.pdmodel | +| params_filename | 参数文件的名称,默认值为inference.pdiparams | +| task_name | 要执行的任务名称,默认值为afqmc | +| dataset | 模型使用的数据集,默认值为clue | +| device | 用于推理的设备,默认为gpu,可选cpu或gpu | +| batch_size | 推理时的batch size,默认为32 | +| max_seq_len | 输入序列在分词后的最大长度,默认值为128,如果序列长于此值,将会被截断;如果短于此值,将会被填充| +| perf_warmup_steps | 性能测试的预热步数,默认值为20 | +| use_trt | 一个标志(flag),用于决定是否使用TensorRT推理 | +| precision | 推理精度,默认为fp32,可选fp16或int8 | +| use_mkldnn | 一个标志(flag),用于决定是否使用MKLDNN推理 | +| cpu_threads | CPU线程数,默认为1 | + - TensorRT预测: From 4c0a909a3d10d94d44bb9c9304be570f23fa9f60 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Fri, 2 Feb 2024 02:57:26 +0000 Subject: [PATCH 9/9] =?UTF-8?q?=E4=BF=AE=E6=94=B9nlp=E7=9A=84readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- example/auto_compression/nlp/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/auto_compression/nlp/README.md b/example/auto_compression/nlp/README.md index 3b17ec6f2..11ffe7e08 100644 --- a/example/auto_compression/nlp/README.md +++ b/example/auto_compression/nlp/README.md @@ -228,7 +228,7 @@ QuantPost: | cpu_threads | CPU线程数,默认为1 | -- TensorRT预测: +#### 5.1 TensorRT预测: 首先下载量化好的模型: 
```shell