Commit fa6000a

Added fusing of quantization multiplication to convolution layer (#328) (#341)
1 parent d968109 · commit fa6000a
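
In short, the ConvolutionQuantizeFusion pass now also matches a constant Multiply sitting between the convolution (or its fused activation) and the FakeQuantize, folds that constant into the convolution's requantization parameters, and, when the resulting per-channel scales come out negative, moves the sign into the corresponding weights and bias. Below is a minimal standalone sketch of the algebra that makes the fold legal; the q = round(x / s) + z convention and all names are ours for illustration, not the plugin's internal representation.

#include <cassert>
#include <cmath>

// Toy affine quantizer: q = round(x / scale) + zeroPoint (illustration only).
static int quantize(float x, float scale, int zeroPoint) {
    return static_cast<int>(std::lround(x / scale)) + zeroPoint;
}

int main() {
    const float x = 3.0f;   // convolution output
    const float c = 0.5f;   // constant of the Multiply feeding the FakeQuantize
    const float s = 0.25f;  // quantization step
    const int   z = 4;      // zero point
    // Quantizing c*x with step s gives the same integer as quantizing x with step s/c,
    // which is why the Multiply can disappear into the requantization scale.
    assert(quantize(c * x, s, z) == quantize(x, s / c, z));
    return 0;
}

Whether the stored factor is the step or its reciprocal only decides whether the fold is a divide or a multiply; the diff below handles the per-tensor and per-channel variants separately.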

File tree

2 files changed: +87 -14 lines


modules/arm_plugin/README.md

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ The plugin supports IRv10 and higher. The list of supported layers and its limit
 ## Supported Model Formats
 * FP32 – Supported and preferred
 * FP16 – Supported
-* I8 – Not supported
+* I8 – Experimental support

 ## Supported Input Precision
 * FP32 - Supported

modules/arm_plugin/src/transformations/quantize_fusion.cpp

Lines changed: 86 additions & 13 deletions
@@ -156,9 +156,13 @@ ArmPlugin::pass::ConvolutionQuantizeFusion::ConvolutionQuantizeFusion() {
                                         opset::Sigmoid, opset::Tanh, opset::Relu, opset::Abs,
                                         opset::Elu, opset::Sqrt, opset::SoftPlus, opset::HSwish,
                                         opset::PRelu, opset::Clamp>({node_pattern});
-    auto node_output = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{node_pattern, activation_pattern});
+    auto q_scale = ngraph::pattern::wrap_type<opset::Constant>();
+    auto q_mul = ngraph::pattern::wrap_type<opset::Multiply>({
+        std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{node_pattern, activation_pattern}),
+        q_scale},
+        ngraph::pattern::consumers_count(1));
     auto fq_pattern = ngraph::pattern::wrap_type<opset::FakeQuantize>({
-        node_output,
+        std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{node_pattern, activation_pattern, q_mul}),
         ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
         ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
         ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
@@ -169,6 +173,7 @@ ArmPlugin::pass::ConvolutionQuantizeFusion::ConvolutionQuantizeFusion() {
         auto pattern_map = m.get_pattern_value_map();
         auto node = pattern_map[node_pattern].get_node_shared_ptr();
         auto fakeQuantize = safe_cast<opset::FakeQuantize>(pattern_map[fq_pattern].get_node_shared_ptr());
+        auto itMul = pattern_map.find(q_mul);
         auto itActivation = pattern_map.find(activation_pattern);
         auto realType = node->get_output_element_type(0);
         auto quantizedType = fakeQuantize->get_output_element_type(0);
@@ -179,6 +184,25 @@ ArmPlugin::pass::ConvolutionQuantizeFusion::ConvolutionQuantizeFusion() {
                                                  getFloatVector(fakeQuantize->input_value(2).get_node()),
                                                  getFloatVector(fakeQuantize->input_value(3).get_node()),
                                                  getFloatVector(fakeQuantize->input_value(4).get_node()));
+        if (itMul != pattern_map.end()) {
+            std::vector<float> scales = getFloatVector(pattern_map[q_scale].get_node());
+            if (allEqualToFirst(scales)) {
+                if (scales[0] == 0.) return false; // Scale multiplier shouldn't be zero
+                for (auto&& v : quantizationInfo.first) v *= scales[0];
+            } else if (quantizationInfo.first.size() > 1) {
+                if (scales.size() != quantizationInfo.first.size()) return false;
+                std::transform(quantizationInfo.first.begin(), quantizationInfo.first.end(), scales.begin(),
+                               quantizationInfo.first.begin(), [](float f, float sc) -> float { return f * sc; });
+            } else {
+                std::vector<float> qiScales, qiOffsets;
+                for (auto&& sc : scales) {
+                    qiScales.emplace_back(quantizationInfo.first[0] * sc);
+                    qiOffsets.emplace_back(quantizationInfo.second[0]);
+                }
+                quantizationInfo.first.swap(qiScales);
+                quantizationInfo.second.swap(qiOffsets);
+            }
+        }

         std::vector<ngraph::Output<ngraph::Node>> newInputs;
         Types inputTypes;
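
The block added above distinguishes three cases when folding the Multiply constant into the quantization info: a uniform multiplier simply scales every existing value (and a zero multiplier aborts the fusion), a per-channel multiplier must match an already per-channel scale vector element for element, and a per-channel multiplier applied to a single per-tensor scale broadcasts that scale and its offset out to per-channel form. A hedged, self-contained sketch of the same case split on plain vectors, with our own names:

#include <algorithm>
#include <vector>

// Illustration only: fold a Multiply constant into requantization scales/offsets.
// `scales`/`offsets` play the role of quantizationInfo.first/.second; `mul` is the constant.
static bool foldMultiplier(std::vector<float>& scales, std::vector<float>& offsets,
                           const std::vector<float>& mul) {
    if (mul.empty() || scales.empty() || offsets.empty()) return false;
    const bool uniform = std::all_of(mul.begin(), mul.end(),
                                     [&](float m) { return m == mul.front(); });
    if (uniform) {                                   // per-tensor multiplier
        if (mul.front() == 0.f) return false;        // a zero multiplier cannot be folded
        for (auto& s : scales) s *= mul.front();
    } else if (scales.size() > 1) {                  // per-channel onto per-channel
        if (mul.size() != scales.size()) return false;
        std::transform(scales.begin(), scales.end(), mul.begin(), scales.begin(),
                       [](float s, float m) { return s * m; });
    } else {                                         // per-channel onto per-tensor: broadcast
        std::vector<float> newScales, newOffsets;
        for (float m : mul) {
            newScales.push_back(scales.front() * m);
            newOffsets.push_back(offsets.front());
        }
        scales.swap(newScales);
        offsets.swap(newOffsets);
    }
    return true;
}
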
@@ -187,27 +211,75 @@ ArmPlugin::pass::ConvolutionQuantizeFusion::ConvolutionQuantizeFusion() {
             newInputs.emplace_back(
                 ngraph::op::TemporaryReplaceOutputType{input.get_source_output(), realType}.get());
         }
+
+        std::shared_ptr<ngraph::Node> bias;
+        if (node->inputs().size() > 2) {
+            bias = node->input_value(2).get_node_shared_ptr();
+        }
+
+        bool negativeScales = std::any_of(std::begin(quantizationInfo.first), std::end(quantizationInfo.first), [] (auto& value) {return value < 0;});
+        if (negativeScales) {
+            if (node->get_input_element_type(1) != ngraph::element::i8)
+                return false;
+            std::vector<std::int8_t> negate;
+            std::transform(quantizationInfo.first.begin(), quantizationInfo.first.end(), std::back_inserter(negate),
+                           [](float f) -> std::int8_t { return f < 0 ? -1 : 1; } );
+            std::transform(quantizationInfo.first.begin(), quantizationInfo.first.end(), quantizationInfo.first.begin(),
+                           [](float f) -> float { return f < 0 ? -f : f; } );
+            std::shared_ptr<ngraph::Node> weightMultiply;
+            if (ngraph::is_type<opset::Constant>(node->input_value(1).get_node())) {
+                std::vector<std::int8_t> weights = safe_cast<const opset::Constant>(node->input_value(1).get_node())->cast_vector<std::int8_t>();
+                size_t step = weights.size() / negate.size();
+                auto weightsIt = weights.begin();
+                for (auto&& sign : negate) {
+                    std::transform(weightsIt, weightsIt + step, weightsIt, [&sign](std::int8_t w) -> std::int8_t { return w * sign; } );
+                    weightsIt += step;
+                }
+                weightMultiply = std::make_shared<opset::Constant>(node->get_input_element_type(1), node->get_input_shape(1), weights);
+            } else {
+                weightMultiply = std::make_shared<opset::Multiply>(node->input_value(1),
+                                                                   std::make_shared<opset::Constant>(node->get_input_element_type(1),
+                                                                                                     ngraph::Shape{negate.size(), 1, 1, 1}, negate));
+            }
+            weightMultiply->set_friendly_name(node->input_value(1).get_node_shared_ptr()->get_friendly_name() + "_weights_negate");
+            ngraph::copy_runtime_info(node->input_value(1).get_node_shared_ptr(), weightMultiply);
+            newInputs[1] = ngraph::op::TemporaryReplaceOutputType{weightMultiply->output(0), ngraph::element::i8}.get();
+
+            if (bias) {
+                bias = std::make_shared<opset::Multiply>(bias,
+                                                         std::make_shared<opset::Constant>(ngraph::element::f32,
+                                                                                           ngraph::Shape{negate.size()}, negate));
+                bias->set_friendly_name(node->input_value(2).get_node_shared_ptr()->get_friendly_name() + "_bias_negate");
+                ngraph::copy_runtime_info(node->input_value(2).get_node_shared_ptr(), bias);
+                newInputs[2] = ngraph::op::TemporaryReplaceOutputType{bias->output(0), realType}.get();
+            }
+        }
+
         std::int32_t qiOffset = 0;
         if (!allEqualToFirst(quantizationInfo.second)) {
-            auto shape = ngraph::Shape{quantizationInfo.second.size()};
-            std::vector<float> invScale;
-            std::transform(quantizationInfo.first.begin(), quantizationInfo.first.end(), std::back_inserter(invScale),
-                           [](float f) -> float { return 1./f; } );
-            std::shared_ptr<ngraph::Node> bias = std::make_shared<opset::Multiply>(
-                std::make_shared<opset::Constant>(ngraph::element::f32, shape, quantizationInfo.second),
-                std::make_shared<opset::Constant>(ngraph::element::f32, shape, invScale));
-            OPENVINO_ASSERT(bias, "Failed to create bias node for fused convolution");
-            if (node->inputs().size() > 2) {
-                bias = std::make_shared<opset::Add>(node->input_value(2), bias);
+            std::transform(quantizationInfo.second.begin(), quantizationInfo.second.end(), quantizationInfo.first.begin(),
+                           quantizationInfo.second.begin(), [](float sh, float sc) -> float { return sh / sc; } );
+            std::shared_ptr<ngraph::Node> zpbias = std::make_shared<opset::Constant>(ngraph::element::f32,
+                                                                                     ngraph::Shape{quantizationInfo.second.size()},
+                                                                                     quantizationInfo.second);
+            OPENVINO_ASSERT(zpbias, "Failed to convert zero point to bias node for fused convolution");
+            if (bias) {
+                bias = std::make_shared<opset::Add>(bias, zpbias);
+                bias->set_friendly_name(node->input_value(2).get_node_shared_ptr()->get_friendly_name() + "_bias_fusedzp");
+                ngraph::copy_runtime_info(node->input_value(2).get_node_shared_ptr(), bias);
                 newInputs[2] = ngraph::op::TemporaryReplaceOutputType{bias->output(0), realType}.get();
             } else {
                 inputTypes.emplace_back(realType);
-                newInputs.emplace_back(ngraph::op::TemporaryReplaceOutputType{bias->output(0), realType}.get());
+                newInputs.emplace_back(ngraph::op::TemporaryReplaceOutputType{zpbias->output(0), realType}.get());
             }
         } else {
             qiOffset = static_cast<std::int32_t>(std::round(quantizationInfo.second[0]));
         }
         auto newNode = makeTypeRelaxed(node.get(), newInputs, inputTypes, Types{quantizedType});
+        if (!bias && newInputs.size() == 3 && newNode->inputs().size() != 3) {
+            //TypeRelaxed operations unable to extend amount of inputs on copy
+            newNode->set_argument(2, newInputs.at(2));
+        }

         if (itActivation != pattern_map.end()) {
             auto activation = itActivation->second.get_node_shared_ptr();
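
Two details in the block added above are worth calling out. A negative per-channel scale is made positive by flipping the sign of that output channel's weights (and of its bias entry), using the identity (-s)·(w·x) = s·((-w)·x); presumably the backend cannot consume negative requantization scales directly. And when zero points are not uniform, they are converted into an extra bias term (offset divided by scale) rather than a single integer offset. A small sketch of the sign-flipping step on flat buffers, with our own names and an assumed [channel][elements-per-channel] weight layout:

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustration only: absorb negative per-channel scales into the sign of the weights.
static void absorbNegativeScales(std::vector<float>& scales,
                                 std::vector<std::int8_t>& weights,
                                 std::vector<float>& bias) {
    const std::size_t perChannel = scales.empty() ? 0 : weights.size() / scales.size();
    for (std::size_t oc = 0; oc < scales.size(); ++oc) {
        if (scales[oc] >= 0.f) continue;
        scales[oc] = -scales[oc];                        // make the scale positive
        for (std::size_t i = 0; i < perChannel; ++i)     // ...and negate that channel's weights
            weights[oc * perChannel + i] = static_cast<std::int8_t>(-weights[oc * perChannel + i]);
        if (oc < bias.size()) bias[oc] = -bias[oc];      // keep the bias consistent
    }
}
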
@@ -228,6 +300,7 @@ ArmPlugin::pass::ConvolutionQuantizeFusion::ConvolutionQuantizeFusion() {
             if (!allEqualToFirst(quantizationInfo.first)) {
                 if (node->get_input_element_type(1) != ngraph::element::i8)
                     return false;
+                //Is it correct if fused activation exists?
                 newNode->get_rt_info()["WeightsPrescaleInfo"] =
                     arm_compute::QuantizationInfo{quantizationInfo.first, std::vector<std::int32_t>(quantizationInfo.first.size(), 0)};
             } else {
