From 9427841d0052a44d60e861897bd5b75dd68d5436 Mon Sep 17 00:00:00 2001 From: kennbot <5740350+KennBot@users.noreply.github.com> Date: Sat, 15 Jan 2022 17:46:14 -0800 Subject: [PATCH] Fixes issue 47 (clxie). Picks up fixes by rmlarson in the TF2.x repository to this same library, which addressed 'specific integer intrinsics were MIA from the standard header on some versions of GCC', seen in TF2.x issue 39180 'TensorFlow build is failing on Bazel CI due to Eigen Update' (meteorcloudy), fixed on May 5 2020. Signed-off-by: kennbot <5740350+KennBot@users.noreply.github.com> --- .../CXX11/src/FixedPoint/PacketMathAVX.h | 149 ++++++++++++ .../CXX11/src/FixedPoint/PacketMathAVX2.h | 117 +++------- .../CXX11/src/FixedPoint/PacketMathAVX512.h | 221 ++++++++---------- .../CXX11/src/FixedPoint/TypeCastingAVX2.h | 25 +- .../CXX11/src/FixedPoint/TypeCastingAVX512.h | 2 +- 5 files changed, 306 insertions(+), 208 deletions(-) create mode 100644 third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h new file mode 100644 index 00000000000..1a7cd03d498 --- /dev/null +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX.h @@ -0,0 +1,149 @@ +#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_ +#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_ +#ifdef _MSC_VER + +#include +#include +#include + +#endif + +namespace Eigen { +namespace internal { + +typedef eigen_packet_wrapper<__m256i, 10> Packet32q8i; +typedef eigen_packet_wrapper<__m128i, 11> Packet16q8i; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet32q8i type; + typedef Packet16q8i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef QInt8 type; + typedef Packet16q8i half; + enum { + size = 32, + alignment = Aligned32, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef QInt8 type; + typedef Packet16q8i half; + enum { + size = 16, + alignment = Aligned32, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template <> +EIGEN_STRONG_INLINE Packet32q8i pset1(const QInt8& from) { + return _mm256_set1_epi8(from.value); +} +template <> +EIGEN_STRONG_INLINE Packet32q8i ploadu(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q8i ploadu(const QInt8* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128( + reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet32q8i pload(const QInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16q8i pload(const QInt8* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128( + reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.m_val); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet16q8i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), + from.m_val); +} + +template <> +EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet32q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.m_val); +} +template <> +EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet16q8i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), + from.m_val); +} + +typedef __m256 Packet8f; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8i +pcast(const Packet8f& a, const Packet8f& b, + const Packet8f& c, const Packet8f& d) { + const __m256i a_conv = _mm256_cvtps_epi32(a); + const __m256i b_conv = _mm256_cvtps_epi32(b); + const __m256i c_conv = _mm256_cvtps_epi32(c); + const __m256i d_conv = _mm256_cvtps_epi32(d); + __m128i low = _mm256_castsi256_si128(a_conv); + __m128i high = _mm256_extractf128_si256(a_conv, 1); + __m128i tmp = _mm_packs_epi32(low, high); + __m128i low2 = _mm256_castsi256_si128(b_conv); + __m128i high2 = _mm256_extractf128_si256(b_conv, 1); + __m128i tmp2 = _mm_packs_epi32(low2, high2); + __m128i converted_low = _mm_packs_epi16(tmp, tmp2); + low = _mm256_castsi256_si128(c_conv); + high = _mm256_extractf128_si256(c_conv, 1); + tmp = _mm_packs_epi32(low, high); + low2 = _mm256_castsi256_si128(d_conv); + high2 = _mm256_extractf128_si256(d_conv, 1); + tmp2 = _mm_packs_epi32(low2, high2); + __m128i converted_high = _mm_packs_epi16(tmp, tmp2); + return _mm256_insertf128_si256(_mm256_castsi128_si256(converted_low), + converted_high, 1); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_ diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h index 2b16715c723..4c5e02abc9d 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h @@ -27,61 +27,14 @@ inline int _mm256_extract_epi8_N1(const __m256i X) { namespace Eigen { namespace internal { -typedef struct Packet32q8i { - __m256i val; - operator __m256i() const { return val; } - Packet32q8i() : val(_mm256_setzero_si256()){}; - Packet32q8i(__m256i val) : val(val) {} -} Packet32q8i; - -typedef struct Packet16q16i { - __m256i val; - operator __m256i() const { return val; } - Packet16q16i() : val(_mm256_setzero_si256()){}; - Packet16q16i(__m256i val) : val(val) {} -} Packet16q16i; - -typedef struct Packet32q8u { - __m256i val; - operator __m256i() const { return val; } - Packet32q8u() : val(_mm256_setzero_si256()){}; - Packet32q8u(__m256i val) : val(val) {} -} Packet32q8u; - -typedef struct Packet16q8i { - __m128i val; - operator __m128i() const { return val; } - Packet16q8i() : val(_mm_setzero_si128()) {} - Packet16q8i(__m128i val) : val(val) {} -} Packet16q8i; - -typedef struct Packet16q8u { - __m128i val; - operator __m128i() const { return val; } - Packet16q8u() : val(_mm_setzero_si128()) {} - Packet16q8u(__m128i val) : val(val) {} -} Packet16q8u; - -typedef struct Packet8q16i { - __m128i val; - operator __m128i() const { return val; } - Packet8q16i() : val(_mm_setzero_si128()) {} - Packet8q16i(__m128i val) : val(val) {} -} Packet8q16i; - -typedef struct Packet8q32i { - __m256i val; - operator __m256i() const { return val; } - Packet8q32i() : val(_mm256_setzero_si256()){}; - Packet8q32i(__m256i val) : val(val) {} -} Packet8q32i; - -typedef struct Packet4q32i { - __m128i val; - operator __m128i() const { return val; } - Packet4q32i() : val(_mm_setzero_si128()) {} - Packet4q32i(__m128i val) : val(val) {} -} Packet4q32i; +typedef eigen_packet_wrapper<__m256i, 20> Packet32q8i; +typedef eigen_packet_wrapper<__m256i, 21> Packet16q16i; +typedef eigen_packet_wrapper<__m256i, 22> Packet32q8u; +typedef eigen_packet_wrapper<__m128i, 23> Packet16q8i; +typedef eigen_packet_wrapper<__m128i, 25> Packet16q8u; +typedef eigen_packet_wrapper<__m128i, 26> Packet8q16i; +typedef eigen_packet_wrapper<__m256i, 27> Packet8q32i; +typedef eigen_packet_wrapper<__m128i, 28> Packet4q32i; #ifndef EIGEN_VECTORIZE_AVX512 template <> @@ -315,64 +268,64 @@ EIGEN_STRONG_INLINE Packet8q32i pload(const QInt32* from) { template <> EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet32q8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( - reinterpret_cast<__m256i*>(to), from.val); + reinterpret_cast<__m256i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet16q8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet32q8u& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( - reinterpret_cast<__m256i*>(to), from.val); + reinterpret_cast<__m256i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet16q16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( - reinterpret_cast<__m256i*>(to), from.val); + reinterpret_cast<__m256i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet8q16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet8q32i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( - reinterpret_cast<__m256i*>(to), from.val); + reinterpret_cast<__m256i*>(to), from.m_val); } // Aligned store template <> EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet8q32i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet16q16i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet8q16i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet32q8u& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet32q8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet16q8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), - from.val); + from.m_val); } // Extract first element. @@ -382,15 +335,15 @@ EIGEN_STRONG_INLINE QInt32 pfirst(const Packet8q32i& a) { } template <> EIGEN_STRONG_INLINE QInt16 pfirst(const Packet16q16i& a) { - return _mm256_extract_epi16_N0(a.val); + return _mm256_extract_epi16_N0(a.m_val); } template <> EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet32q8u& a) { - return static_cast(_mm256_extract_epi8_N0(a.val)); + return static_cast(_mm256_extract_epi8_N0(a.m_val)); } template <> EIGEN_STRONG_INLINE QInt8 pfirst(const Packet32q8i& a) { - return _mm256_extract_epi8_N0(a.val); + return _mm256_extract_epi8_N0(a.m_val); } // Initialize to constant value. @@ -411,7 +364,7 @@ EIGEN_STRONG_INLINE Packet8q32i pset1(const QInt32& from) { template <> EIGEN_STRONG_INLINE Packet8q32i padd(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_add_epi32(a.val, b.val); + return _mm256_add_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q16i pset1(const QInt16& from) { @@ -420,62 +373,62 @@ EIGEN_STRONG_INLINE Packet16q16i pset1(const QInt16& from) { template <> EIGEN_STRONG_INLINE Packet8q32i psub(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_sub_epi32(a.val, b.val); + return _mm256_sub_epi32(a.m_val, b.m_val); } // Note: mullo truncates the result to 32 bits. template <> EIGEN_STRONG_INLINE Packet8q32i pmul(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_mullo_epi32(a.val, b.val); + return _mm256_mullo_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet8q32i pnegate(const Packet8q32i& a) { - return _mm256_sub_epi32(_mm256_setzero_si256(), a.val); + return _mm256_sub_epi32(_mm256_setzero_si256(), a.m_val); } // Min and max. template <> EIGEN_STRONG_INLINE Packet8q32i pmin(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_min_epi32(a.val, b.val); + return _mm256_min_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet8q32i pmax(const Packet8q32i& a, const Packet8q32i& b) { - return _mm256_max_epi32(a.val, b.val); + return _mm256_max_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q16i pmin(const Packet16q16i& a, const Packet16q16i& b) { - return _mm256_min_epi16(a.val, b.val); + return _mm256_min_epi16(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q16i pmax(const Packet16q16i& a, const Packet16q16i& b) { - return _mm256_max_epi16(a.val, b.val); + return _mm256_max_epi16(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet32q8u pmin(const Packet32q8u& a, const Packet32q8u& b) { - return _mm256_min_epu8(a.val, b.val); + return _mm256_min_epu8(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet32q8u pmax(const Packet32q8u& a, const Packet32q8u& b) { - return _mm256_max_epu8(a.val, b.val); + return _mm256_max_epu8(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet32q8i pmin(const Packet32q8i& a, const Packet32q8i& b) { - return _mm256_min_epi8(a.val, b.val); + return _mm256_min_epi8(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet32q8i pmax(const Packet32q8i& a, const Packet32q8i& b) { - return _mm256_max_epi8(a.val, b.val); + return _mm256_max_epi8(a.m_val, b.m_val); } // Reductions. diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h index 6c77aa7b511..5a0ae2e8c8c 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h @@ -6,33 +6,10 @@ namespace Eigen { namespace internal { -typedef struct Packet64q8i { - __m512i val; - operator __m512i() const { return val; } - Packet64q8i(); - Packet64q8i(__m512i val) : val(val) {} -} Packet64q8i; - -typedef struct Packet32q16i { - __m512i val; - operator __m512i() const { return val; } - Packet32q16i(); - Packet32q16i(__m512i val) : val(val) {} -} Packet32q16i; - -typedef struct Packet64q8u { - __m512i val; - operator __m512i() const { return val; } - Packet64q8u(); - Packet64q8u(__m512i val) : val(val) {} -} Packet64q8u; - -typedef struct Packet16q32i { - __m512i val; - operator __m512i() const { return val; } - Packet16q32i(); - Packet16q32i(__m512i val) : val(val) {} -} Packet16q32i; +typedef eigen_packet_wrapper<__m512i, 30> Packet64q8i; +typedef eigen_packet_wrapper<__m512i, 31> Packet32q16i; +typedef eigen_packet_wrapper<__m512i, 32> Packet64q8u; +typedef eigen_packet_wrapper<__m512i, 33> Packet16q32i; template <> struct packet_traits : default_packet_traits { @@ -216,44 +193,44 @@ EIGEN_STRONG_INLINE Packet16q32i pload(const QInt32* from) { template <> EIGEN_STRONG_INLINE void pstoreu(QInt8* to, const Packet64q8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( - reinterpret_cast<__m512i*>(to), from.val); + reinterpret_cast<__m512i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet32q16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( - reinterpret_cast<__m512i*>(to), from.val); + reinterpret_cast<__m512i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet64q8u& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( - reinterpret_cast<__m512i*>(to), from.val); + reinterpret_cast<__m512i*>(to), from.m_val); } template <> EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet16q32i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( - reinterpret_cast<__m512i*>(to), from.val); + reinterpret_cast<__m512i*>(to), from.m_val); } // Aligned store template <> EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet16q32i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet64q8u& from) { EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt8* to, const Packet64q8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), - from.val); + from.m_val); } template <> EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet32q16i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to), - from.val); + from.m_val); } // Extract first element. @@ -264,15 +241,15 @@ EIGEN_STRONG_INLINE QInt32 pfirst(const Packet16q32i& a) { template <> EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet64q8u& a) { return static_cast( - _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0)); + _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0)); } template <> EIGEN_STRONG_INLINE QInt8 pfirst(const Packet64q8i& a) { - return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0); + return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0); } template <> EIGEN_STRONG_INLINE QInt16 pfirst(const Packet32q16i& a) { - return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.val, 0), 0); + return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.m_val, 0), 0); } // Initialize to constant value. @@ -297,46 +274,46 @@ EIGEN_STRONG_INLINE Packet16q32i pset1(const QInt32& from) { template <> EIGEN_STRONG_INLINE Packet16q32i padd(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_add_epi32(a.val, b.val); + return _mm512_add_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q32i psub(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_sub_epi32(a.val, b.val); + return _mm512_sub_epi32(a.m_val, b.m_val); } // Note: mullo truncates the result to 32 bits. template <> EIGEN_STRONG_INLINE Packet16q32i pmul(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_mullo_epi32(a.val, b.val); + return _mm512_mullo_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q32i pnegate(const Packet16q32i& a) { - return _mm512_sub_epi32(_mm512_setzero_si512(), a.val); + return _mm512_sub_epi32(_mm512_setzero_si512(), a.m_val); } // Min and max. template <> EIGEN_STRONG_INLINE Packet16q32i pmin(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_min_epi32(a.val, b.val); + return _mm512_min_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet16q32i pmax(const Packet16q32i& a, const Packet16q32i& b) { - return _mm512_max_epi32(a.val, b.val); + return _mm512_max_epi32(a.m_val, b.m_val); } template <> EIGEN_STRONG_INLINE Packet64q8u pmin(const Packet64q8u& a, const Packet64q8u& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_min_epu8(a.val, b.val); + return _mm512_min_epu8(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_min_epu8(ap0, bp0); __m256i r1 = _mm256_min_epu8(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -346,12 +323,12 @@ template <> EIGEN_STRONG_INLINE Packet64q8u pmax(const Packet64q8u& a, const Packet64q8u& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_max_epu8(a.val, b.val); + return _mm512_max_epu8(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_max_epu8(ap0, bp0); __m256i r1 = _mm256_max_epu8(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -362,12 +339,12 @@ template <> EIGEN_STRONG_INLINE Packet64q8i pmin(const Packet64q8i& a, const Packet64q8i& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_min_epi8(a.val, b.val); + return _mm512_min_epi8(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_min_epi8(ap0, bp0); __m256i r1 = _mm256_min_epi8(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -377,12 +354,12 @@ template <> EIGEN_STRONG_INLINE Packet32q16i pmin(const Packet32q16i& a, const Packet32q16i& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_min_epi16(a.val, b.val); + return _mm512_min_epi16(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_min_epi16(ap0, bp0); __m256i r1 = _mm256_min_epi16(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -392,12 +369,12 @@ template <> EIGEN_STRONG_INLINE Packet64q8i pmax(const Packet64q8i& a, const Packet64q8i& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_max_epi8(a.val, b.val); + return _mm512_max_epi8(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_max_epi8(ap0, bp0); __m256i r1 = _mm256_max_epi8(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -407,12 +384,12 @@ template <> EIGEN_STRONG_INLINE Packet32q16i pmax(const Packet32q16i& a, const Packet32q16i& b) { #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_max_epi16(a.val, b.val); + return _mm512_max_epi16(a.m_val, b.m_val); #else - __m256i ap0 = _mm512_extracti32x8_epi32(a.val, 0); - __m256i ap1 = _mm512_extracti32x8_epi32(a.val, 1); - __m256i bp0 = _mm512_extracti32x8_epi32(b.val, 0); - __m256i bp1 = _mm512_extracti32x8_epi32(b.val, 1); + __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0); + __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1); + __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0); + __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1); __m256i r0 = _mm256_max_epi16(ap0, bp0); __m256i r1 = _mm256_max_epi16(ap1, bp1); return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1); @@ -422,112 +399,112 @@ EIGEN_STRONG_INLINE Packet32q16i pmax(const Packet32q16i& a, // Reductions. template <> EIGEN_STRONG_INLINE QInt32 predux_min(const Packet16q32i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3)); res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst( - _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + return pfirst(res); } template <> EIGEN_STRONG_INLINE QInt32 predux_max(const Packet16q32i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3)); res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst( - _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + return pfirst(res); } template <> EIGEN_STRONG_INLINE QInt16 predux_min(const Packet32q16i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3)); res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::min( {static_cast(w >> 16), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QInt16 predux_max(const Packet32q16i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3)); res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::max( {static_cast(w >> 16), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet64q8u& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3)); res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::min( {static_cast(w >> 24), static_cast(w >> 16), static_cast(w >> 8), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet64q8u& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3)); res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::max( {static_cast(w >> 24), static_cast(w >> 16), static_cast(w >> 8), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QInt8 predux_min(const Packet64q8i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3)); res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::min( {static_cast(w >> 24), static_cast(w >> 16), static_cast(w >> 8), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QInt8 predux_max(const Packet64q8i& a) { - Packet4i lane0 = _mm512_extracti32x4_epi32(a.val, 0); - Packet4i lane1 = _mm512_extracti32x4_epi32(a.val, 1); - Packet4i lane2 = _mm512_extracti32x4_epi32(a.val, 2); - Packet4i lane3 = _mm512_extracti32x4_epi32(a.val, 3); + Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0); + Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1); + Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2); + Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3); Packet4i res = _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3)); res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = pfirst( - _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))); + std::uint32_t w = pfirst(res); return std::min( {static_cast(w >> 24), static_cast(w >> 16), static_cast(w >> 8), static_cast(w)}); diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h index 9561d6a3388..5dd2cd309b8 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h @@ -13,7 +13,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8q32i& a) { - return _mm256_cvtepi32_ps(a.val); + return _mm256_cvtepi32_ps(a.m_val); } template <> @@ -35,14 +35,33 @@ template <> EIGEN_STRONG_INLINE Packet32q8i pcast(const Packet8q32i& a, const Packet8q32i& b, const Packet8q32i& c, const Packet8q32i& d) { - __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.val, b.val), - _mm256_packs_epi32(c.val, d.val)); + __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.m_val, b.m_val), + _mm256_packs_epi32(c.m_val, d.m_val)); // Since packs does not cross 128 bit lane boundaries, // we have to permute to properly order the final result. const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); return _mm256_permutevar8x32_epi32(converted, permute_mask); } +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet32q8i +pcast(const Packet8f& a, const Packet8f& b, + const Packet8f& c, const Packet8f& d) { + const __m256i a_conv = _mm256_cvtps_epi32(a); + const __m256i b_conv = _mm256_cvtps_epi32(b); + const __m256i c_conv = _mm256_cvtps_epi32(c); + const __m256i d_conv = _mm256_cvtps_epi32(d); + __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a_conv, b_conv), + _mm256_packs_epi32(c_conv, d_conv)); + const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + return _mm256_permutevar8x32_epi32(converted, permute_mask); +} + template <> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h index d3b02402971..17408d13abf 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h @@ -14,7 +14,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16q32i& a) { - return _mm512_cvtepi32_ps(a.val); + return _mm512_cvtepi32_ps(a.m_val); } template <>