// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "cpu_kernel.hpp"
#include "op.hpp"
#include <details/ie_exception.hpp>
#include <ie_layouts.h>
#include "ie_parallel.hpp"

#include <cmath>    // sqrtf
#include <cstring>  // strncpy
#include <limits>   // std::numeric_limits
#include <numeric>  // std::iota
#include <vector>

using namespace TemplateExtension;

//! [cpu_implementation:ctor]
LSTSQImpl::LSTSQImpl(const std::shared_ptr<ngraph::Node> &node) {
    try {
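        // Validate that the node really is our LSTSQ custom op and that its
        // shapes and element types match what execute() below supports.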
        auto castedNode = std::dynamic_pointer_cast<LSTSQOp>(node);
        if (!castedNode)
            THROW_IE_EXCEPTION << "Cannot create implementation for unknown operation!";
        if (castedNode->inputs().size() != 2 || castedNode->outputs().size() != 1)
            THROW_IE_EXCEPTION << "Cannot create implementation for operation with incorrect number of inputs or outputs!";
        if (castedNode->get_input_partial_shape(0).is_dynamic() || castedNode->get_output_partial_shape(0).is_dynamic())
            THROW_IE_EXCEPTION << "Cannot create implementation for op with dynamic shapes!";
        if (castedNode->get_input_shape(0).size() != 2 || castedNode->get_output_shape(0).size() != 2)
            THROW_IE_EXCEPTION << "Operation supports only 2d tensors for input and output.";
        if (castedNode->get_input_element_type(0) != ngraph::element::f32 || castedNode->get_output_element_type(0) != ngraph::element::f32)
            THROW_IE_EXCEPTION << "Operation supports only FP32 tensors.";
        inShapes.resize(2);
        for (size_t i = 0; i < inShapes.size(); ++i)
            inShapes[i] = castedNode->get_input_shape(i);
        outShape = castedNode->get_output_shape(0);
    } catch (InferenceEngine::details::InferenceEngineException& ex) {
        error = ex.what();
    }
}
//! [cpu_implementation:ctor]

//! [cpu_implementation:getSupportedConfigurations]
InferenceEngine::StatusCode LSTSQImpl::getSupportedConfigurations(std::vector<InferenceEngine::LayerConfig> &conf,
                                                                  InferenceEngine::ResponseDesc *resp) noexcept {
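    // Describe the single supported configuration: FP32 tensors in the
    // natural (planar) dimension order for both inputs and the output.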
    std::vector<InferenceEngine::DataConfig> inDataConfig;
    std::vector<InferenceEngine::DataConfig> outDataConfig;
    // Allow any offset before data
    size_t offset((std::numeric_limits<size_t>::max)());

    // Input configurations
    for (const auto& shape : inShapes) {
        InferenceEngine::SizeVector order(shape.size());
        std::iota(order.begin(), order.end(), 0);

        InferenceEngine::DataConfig inpConf;
        inpConf.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, shape, {shape, order, offset});
        inDataConfig.push_back(inpConf);
    }

    // Output configuration
    InferenceEngine::SizeVector order(outShape.size());
    std::iota(order.begin(), order.end(), 0);

    InferenceEngine::DataConfig outConf;
    outConf.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, outShape, {outShape, order, offset});
    outDataConfig.push_back(outConf);

    InferenceEngine::LayerConfig layerConfig;
    layerConfig.inConfs = inDataConfig;
    layerConfig.outConfs = outDataConfig;

    conf.push_back(layerConfig);
    return InferenceEngine::StatusCode::OK;
}
//! [cpu_implementation:getSupportedConfigurations]

//! [cpu_implementation:init]
InferenceEngine::StatusCode LSTSQImpl::init(InferenceEngine::LayerConfig &config, InferenceEngine::ResponseDesc *resp) noexcept {
    try {
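        // Re-check the configuration actually selected by the plugin;
        // the checks mirror the constructor validation.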
        if (config.inConfs.size() != 2 || config.outConfs.size() != 1) {
            THROW_IE_EXCEPTION << "Operation cannot be initialized with incorrect number of inputs/outputs!";
        }

        if (config.inConfs[0].desc.getDims().size() != 2 || config.outConfs[0].desc.getDims().size() != 2) {
            THROW_IE_EXCEPTION << "Operation can be initialized only with 2d input/output tensors!";
        }

        if (config.outConfs[0].desc.getPrecision() != InferenceEngine::Precision::FP32 ||
            config.inConfs[0].desc.getPrecision() != InferenceEngine::Precision::FP32) {
            THROW_IE_EXCEPTION << "Operation supports only FP32 precisions!";
        }
    } catch (InferenceEngine::details::InferenceEngineException& ex) {
        // Report the actual failure reason instead of a stale constructor error.
        error = ex.what();
        if (resp) {
            strncpy(resp->msg, error.c_str(), sizeof(resp->msg) - 1);
            resp->msg[sizeof(resp->msg) - 1] = 0;
        }
        return InferenceEngine::GENERAL_ERROR;
    }

    return InferenceEngine::OK;
}
//! [cpu_implementation:init]

//! [cpu_implementation:execute]
InferenceEngine::StatusCode LSTSQImpl::execute(std::vector<InferenceEngine::Blob::Ptr> &inputs,
                                               std::vector<InferenceEngine::Blob::Ptr> &outputs,
                                               InferenceEngine::ResponseDesc *resp) noexcept {
    const float* B = inputs[0]->cbuffer().as<const float*>();
    const float* A = inputs[1]->cbuffer().as<const float*>();
    float* out = outputs[0]->buffer().as<float*>();

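    // Least-squares background: for A of shape [M, 2] with linearly
    // independent columns and B of shape [M, N], the minimizer of
    // ||A*X - B|| is X = inverse(R) * transpose(Q) * B, where A = Q*R is the
    // thin QR factorization (Q is [M, 2] with orthonormal columns, R is
    // [2, 2] upper triangular). Below, Q and R are built with classical
    // Gram-Schmidt specialized to two columns.
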
    // Perform the QR factorization A = Q*R. This implementation works on A with 2 columns.
    const size_t M = inputs[0]->getTensorDesc().getDims()[0];
    const size_t N = inputs[0]->getTensorDesc().getDims()[1];

    std::vector<float> Q(M * 2);
    std::vector<float> R(4, 0.0f);
    float norm0 = 0.0f;
    float product = 0.0f;  // becomes dot(q0, a1) after division by norm0
    for (size_t i = 0; i < M; ++i) {
        float val = A[i * 2];
        product += A[i * 2 + 1] * val;
        norm0 += val * val;
    }
    norm0 = sqrtf(norm0);
    product /= norm0;
    R[1] = product;
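
    // At this point norm0 = ||a0|| and R[1] holds R(0,1) = dot(q0, a1),
    // where a0, a1 are the columns of A and q0 = a0 / norm0.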

    float norm1 = 0.0f;
    for (size_t i = 0; i < M; ++i) {
        float val = A[i * 2] / norm0;
        Q[i * 2] = val;
        R[0] += A[i * 2] * val;

        val = A[i * 2 + 1] - product * val;
        Q[i * 2 + 1] = val;
        norm1 += val * val;
        R[3] += A[i * 2 + 1] * val;
    }
    norm1 = sqrtf(norm1);
    for (size_t i = 0; i < M; ++i) {
        Q[i * 2 + 1] /= norm1;
    }
    R[3] /= norm1;
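
    // Q now has orthonormal columns, and R = [[norm0, dot(q0, a1)], [0, norm1]]
    // (row-major storage: R[0], R[1], R[2] == 0, R[3]).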

    // Invert the upper-triangular 2x2 matrix R:
    // inverse([[a, b], [0, d]]) = [[d, -b], [0, a]] / (a * d)
    float scale = 1.0f / (R[0] * R[3]);
    std::vector<float> R_inv{R[3] * scale, -R[1] * scale, 0.0f, R[0] * scale};

    // Output is inverse(R) * transpose(Q) * B. First fold inverse(R) into Q,
    // so the rows of transpose(Q) become the rows of inverse(R) * transpose(Q).
    for (size_t i = 0; i < M; ++i) {
        Q[i * 2] = R_inv[0] * Q[i * 2] + R_inv[1] * Q[i * 2 + 1];
        Q[i * 2 + 1] *= R_inv[3];
    }

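    // Multiply the folded [2, M] matrix transpose(Q) by B [M, N] to produce
    // the [2, N] solution, written row by row into the output.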
    for (size_t i = 0; i < N; ++i) {
        out[i] = 0.0f;
        out[N + i] = 0.0f;
        for (size_t j = 0; j < M; ++j) {
            out[i] += Q[j * 2] * B[j * N + i];
            out[N + i] += Q[j * 2 + 1] * B[j * N + i];
        }
    }
    return InferenceEngine::OK;
}
//! [cpu_implementation:execute]