diff --git a/.gitignore b/.gitignore index c7456765..0de2613d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,5 @@ __pycache__/ # Example generated files. example_data_*/ +_codeql_build_dir/ +_codeql_detected_source_root diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 9f492499..663b96e1 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -216,12 +216,12 @@ struct CosineSimilarityImpl { ///// // Shared implementation among those that use floating-point arithmetic. -template struct CosineFloatOp; +template struct CosineFloatOp; SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) #if SVS_AVX512_F -template <> struct CosineFloatOp<16> : public svs::simd::ConvertToFloat<16> { +template <> struct CosineFloatOp<16, AVX_AVAILABILITY::AVX512> : public svs::simd::ConvertToFloat<16> { using parent = svs::simd::ConvertToFloat<16>; using mask_t = typename parent::mask_t; @@ -287,7 +287,7 @@ struct CosineSimilarityImpl { (a_norm * b_norm); } // Fallback to AVX512 - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16, AVX_AVAILABILITY::AVX512>(), a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -320,7 +320,7 @@ struct CosineSimilarityImpl { (a_norm * b_norm); } // Fallback to AVX512 - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16, AVX_AVAILABILITY::AVX512>(), a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -331,7 +331,7 @@ struct CosineSimilarityImpl { template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const float* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16, AVX_AVAILABILITY::AVX512>(), a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -340,7 +340,7 @@ template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16, AVX_AVAILABILITY::AVX512>(), a, b, length); return sum / (std::sqrt(norm) * a_norm); }; }; @@ -349,7 +349,7 @@ template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16, AVX_AVAILABILITY::AVX512>(), a, b, length); return sum / (std::sqrt(norm) * a_norm); }; }; @@ -358,7 +358,7 @@ template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -367,7 +367,7 @@ template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -376,7 +376,7 @@ template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -391,7 +391,7 @@ SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template <> struct CosineFloatOp<8> : public svs::simd::ConvertToFloat<8> { +template <> struct CosineFloatOp<8, AVX_AVAILABILITY::AVX2> : public svs::simd::ConvertToFloat<8> { using parent = svs::simd::ConvertToFloat<8>; using mask_t = typename parent::mask_t; static constexpr size_t simd_width = 8; @@ -432,7 +432,7 @@ template <> struct CosineFloatOp<8> : public svs::simd::ConvertToFloat<8> { template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const float* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8>(), a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8, AVX_AVAILABILITY::AVX2>(), a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -440,7 +440,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8>(), a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8, AVX_AVAILABILITY::AVX2>(), a, b, length); return sum / (std::sqrt(norm) * a_norm); }; }; @@ -448,7 +448,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8>(), a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8, AVX_AVAILABILITY::AVX2>(), a, b, length); return sum / (std::sqrt(norm) * a_norm); }; }; @@ -456,7 +456,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8>{}, a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -464,7 +464,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8>{}, a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -473,7 +473,7 @@ template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8>{}, a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -481,7 +481,7 @@ struct CosineSimilarityImpl { template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8>{}, a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; @@ -490,36 +490,10 @@ template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { - auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8>{}, a, b, length); + auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); return sum / (std::sqrt(norm) * a_norm); } }; -#endif - -#if defined(__x86_64__) - -#include "svs/multi-arch/x86/preprocessor.h" -// TODO: connect with dim_supported_list -DISTANCE_CS_EXTERN_TEMPLATE(64, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_EXTERN_TEMPLATE(96, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_EXTERN_TEMPLATE(100, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_EXTERN_TEMPLATE(128, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_EXTERN_TEMPLATE(160, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_EXTERN_TEMPLATE(200, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_EXTERN_TEMPLATE(512, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_EXTERN_TEMPLATE(768, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_EXTERN_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX512); - -DISTANCE_CS_EXTERN_TEMPLATE(64, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_EXTERN_TEMPLATE(96, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_EXTERN_TEMPLATE(100, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_EXTERN_TEMPLATE(128, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_EXTERN_TEMPLATE(160, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_EXTERN_TEMPLATE(200, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_EXTERN_TEMPLATE(512, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_EXTERN_TEMPLATE(768, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_EXTERN_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX2); - #endif } // namespace svs::distance diff --git a/include/svs/core/distance/euclidean.h b/include/svs/core/distance/euclidean.h index b038a6fc..e56893ed 100644 --- a/include/svs/core/distance/euclidean.h +++ b/include/svs/core/distance/euclidean.h @@ -228,16 +228,16 @@ template struct L2Imp // SIMD accelerated operations that convert both left and right hand arguments to // ``float`` and perform arithmetic on those floating point operands. -template struct L2FloatOp; +template struct L2FloatOp; // SIMD accelerated operations that convert both left and right hand arguments to // ``To`` and perform arithmetic on those integer operands. -template struct L2VNNIOp; +template struct L2VNNIOp; SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) #if SVS_AVX512_F -template <> struct L2FloatOp<16> : public svs::simd::ConvertToFloat<16> { +template <> struct L2FloatOp<16, AVX_AVAILABILITY::AVX512> : public svs::simd::ConvertToFloat<16> { using parent = svs::simd::ConvertToFloat<16>; using mask_t = typename parent::mask_t; @@ -262,7 +262,7 @@ template <> struct L2FloatOp<16> : public svs::simd::ConvertToFloat<16> { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_VNNI) #if SVS_AVX512_VNNI -template <> struct L2VNNIOp : public svs::simd::ConvertForVNNI { +template <> struct L2VNNIOp : public svs::simd::ConvertForVNNI { using parent = svs::simd::ConvertForVNNI; using reg_t = typename parent::reg_t; using mask_t = typename parent::mask_t; @@ -294,10 +294,10 @@ template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { if (__builtin_expect(svs::detail::avx_runtime_flags.is_avx512vnni_supported(), 1)) { - return simd::generic_simd_op(L2VNNIOp(), a, b, length); + return simd::generic_simd_op(L2VNNIOp(), a, b, length); } // fallback to AVX512 - return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; @@ -305,10 +305,10 @@ template struct L2Impl SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { if (__builtin_expect(svs::detail::avx_runtime_flags.is_avx512vnni_supported(), 1)) { - return simd::generic_simd_op(L2VNNIOp(), a, b, length); + return simd::generic_simd_op(L2VNNIOp(), a, b, length); } // fallback to AVX512 - return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; @@ -318,42 +318,42 @@ template struct L2Impl template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); }; }; template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); }; }; template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); }; }; @@ -367,7 +367,7 @@ SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template <> struct L2FloatOp<8> : public svs::simd::ConvertToFloat<8> { +template <> struct L2FloatOp<8, AVX_AVAILABILITY::AVX2> : public svs::simd::ConvertToFloat<8> { using parent = svs::simd::ConvertToFloat<8>; using mask_t = typename parent::mask_t; static constexpr size_t simd_width = 8; @@ -393,71 +393,45 @@ template <> struct L2FloatOp<8> : public svs::simd::ConvertToFloat<8> { template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<8>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<8>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<8>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<8>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<8>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct L2Impl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { - return simd::generic_simd_op(L2FloatOp<8>{}, a, b, length); + return simd::generic_simd_op(L2FloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; #endif -#if defined(__x86_64__) - -#include "svs/multi-arch/x86/preprocessor.h" - -// TODO: connect with dim_supported_list -DISTANCE_L2_EXTERN_TEMPLATE(64, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_EXTERN_TEMPLATE(96, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_EXTERN_TEMPLATE(100, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_EXTERN_TEMPLATE(128, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_EXTERN_TEMPLATE(160, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_EXTERN_TEMPLATE(200, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_EXTERN_TEMPLATE(512, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_EXTERN_TEMPLATE(768, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_EXTERN_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX512); - -DISTANCE_L2_EXTERN_TEMPLATE(64, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_EXTERN_TEMPLATE(96, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_EXTERN_TEMPLATE(100, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_EXTERN_TEMPLATE(200, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_EXTERN_TEMPLATE(128, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_EXTERN_TEMPLATE(200, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_EXTERN_TEMPLATE(512, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_EXTERN_TEMPLATE(768, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_EXTERN_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX2); -#endif - } // namespace svs::distance diff --git a/include/svs/core/distance/inner_product.h b/include/svs/core/distance/inner_product.h index 0f7837a5..ca2929f4 100644 --- a/include/svs/core/distance/inner_product.h +++ b/include/svs/core/distance/inner_product.h @@ -187,16 +187,17 @@ template struct IPImp ///// // Shared implementation among those that use floating-point arithmetic. -template struct IPFloatOp; +// Parameterized by AVX_AVAILABILITY to enable architecture-specific instantiation +template struct IPFloatOp; // SIMD accelerated operations that convert both left and right hand arguments to // ``To`` and perform arithmetic on those integer operands. -template struct IPVNNIOp; +template struct IPVNNIOp; SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) #if SVS_AVX512_F -template <> struct IPFloatOp<16> : public svs::simd::ConvertToFloat<16> { +template <> struct IPFloatOp<16, AVX_AVAILABILITY::AVX512> : public svs::simd::ConvertToFloat<16> { using parent = svs::simd::ConvertToFloat<16>; using mask_t = typename parent::mask_t; @@ -219,7 +220,7 @@ template <> struct IPFloatOp<16> : public svs::simd::ConvertToFloat<16> { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_VNNI) #if SVS_AVX512_VNNI -template <> struct IPVNNIOp : public svs::simd::ConvertForVNNI { +template <> struct IPVNNIOp : public svs::simd::ConvertForVNNI { using parent = svs::simd::ConvertForVNNI; using reg_t = typename parent::reg_t; using mask_t = typename parent::mask_t; @@ -248,10 +249,10 @@ template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { if (__builtin_expect(svs::detail::avx_runtime_flags.is_avx512vnni_supported(), 1)) { - return simd::generic_simd_op(IPVNNIOp(), a, b, length); + return simd::generic_simd_op(IPVNNIOp(), a, b, length); } // fallback to AVX512 - return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; @@ -259,10 +260,10 @@ template struct IPImpl SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { if (__builtin_expect(svs::detail::avx_runtime_flags.is_avx512vnni_supported(), 1)) { - return simd::generic_simd_op(IPVNNIOp(), a, b, length); + return simd::generic_simd_op(IPVNNIOp(), a, b, length); } // fallback to AVX512 - return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; @@ -272,42 +273,42 @@ template struct IPImpl template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); }; }; template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); }; }; template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<16, AVX_AVAILABILITY::AVX512>{}, a, b, length); } }; #endif @@ -320,7 +321,7 @@ SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template <> struct IPFloatOp<8> : public svs::simd::ConvertToFloat<8> { +template <> struct IPFloatOp<8, AVX_AVAILABILITY::AVX2> : public svs::simd::ConvertToFloat<8> { using parent = svs::simd::ConvertToFloat<8>; using mask_t = typename parent::mask_t; static constexpr size_t simd_width = 8; @@ -344,70 +345,45 @@ template <> struct IPFloatOp<8> : public svs::simd::ConvertToFloat<8> { template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<8>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<8>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<8>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<8>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<8>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; template struct IPImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { - return svs::simd::generic_simd_op(IPFloatOp<8>{}, a, b, length); + return svs::simd::generic_simd_op(IPFloatOp<8, AVX_AVAILABILITY::AVX2>{}, a, b, length); } }; #endif -#if defined(__x86_64__) - -#include "svs/multi-arch/x86/preprocessor.h" -// TODO: connect with dim_supported_list -DISTANCE_IP_EXTERN_TEMPLATE(64, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_EXTERN_TEMPLATE(96, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_EXTERN_TEMPLATE(100, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_EXTERN_TEMPLATE(128, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_EXTERN_TEMPLATE(160, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_EXTERN_TEMPLATE(200, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_EXTERN_TEMPLATE(512, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_EXTERN_TEMPLATE(768, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_EXTERN_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX512); - -DISTANCE_IP_EXTERN_TEMPLATE(64, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_EXTERN_TEMPLATE(96, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_EXTERN_TEMPLATE(100, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_EXTERN_TEMPLATE(128, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_EXTERN_TEMPLATE(160, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_EXTERN_TEMPLATE(200, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_EXTERN_TEMPLATE(512, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_EXTERN_TEMPLATE(768, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_EXTERN_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX2); -#endif - } // namespace svs::distance diff --git a/include/svs/multi-arch/x86/avx2.cpp b/include/svs/multi-arch/x86/avx2.cpp index bff53ae1..3e5c2dd2 100644 --- a/include/svs/multi-arch/x86/avx2.cpp +++ b/include/svs/multi-arch/x86/avx2.cpp @@ -14,44 +14,50 @@ * limitations under the License. */ +/** + * @file avx2.cpp + * @brief AVX2 specific SIMD operation instantiations + * + * This compilation unit is built with corresponding compiler flags that enable + * AVX2 instruction generation. It contains explicit instantiations of SIMD operation + * structs to force the compiler to generate optimized code using AVX2 intrinsics. + * + * The SIMD ops contain all AVX-specific code and are defined in the distance headers + * within #if !SVS_AVX512_F && SVS_AVX2 guards. By instantiating them here with proper + * compiler flags, we ensure optimized machine code is generated and linked into the + * library. + */ + #if defined(__x86_64__) + +// Include distance headers to get SIMD op definitions #include "svs/core/distance/cosine.h" #include "svs/core/distance/euclidean.h" #include "svs/core/distance/inner_product.h" namespace svs::distance { -// TODO: connect with dim_supported_list -DISTANCE_L2_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_INSTANTIATE_TEMPLATE(128, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_INSTANTIATE_TEMPLATE(160, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_INSTANTIATE_TEMPLATE(200, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX2); -DISTANCE_L2_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX2); - -DISTANCE_IP_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_INSTANTIATE_TEMPLATE(128, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_INSTANTIATE_TEMPLATE(160, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_INSTANTIATE_TEMPLATE(200, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX2); -DISTANCE_IP_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX2); - -DISTANCE_CS_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_INSTANTIATE_TEMPLATE(128, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_INSTANTIATE_TEMPLATE(160, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_INSTANTIATE_TEMPLATE(200, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX2); -DISTANCE_CS_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX2); +///// +///// Inner Product SIMD Ops +///// + +// Instantiate the primary floating-point SIMD op for AVX2 +template struct IPFloatOp<8, AVX_AVAILABILITY::AVX2>; + +///// +///// L2 (Euclidean) SIMD Ops +///// + +// Instantiate the primary floating-point SIMD op for AVX2 +template struct L2FloatOp<8, AVX_AVAILABILITY::AVX2>; + +///// +///// Cosine Similarity SIMD Ops +///// + +// Instantiate the floating-point SIMD op for AVX2 +template struct CosineFloatOp<8, AVX_AVAILABILITY::AVX2>; } // namespace svs::distance -#endif +#endif // defined(__x86_64__) diff --git a/include/svs/multi-arch/x86/avx512.cpp b/include/svs/multi-arch/x86/avx512.cpp index bee150d7..c5d6f538 100644 --- a/include/svs/multi-arch/x86/avx512.cpp +++ b/include/svs/multi-arch/x86/avx512.cpp @@ -14,44 +14,55 @@ * limitations under the License. */ +/** + * @file avx512.cpp + * @brief AVX-512 specific SIMD operation instantiations + * + * This compilation unit is built with corresponding compiler flags that enable + * AVX-512 instruction generation. It contains explicit instantiations of SIMD operation + * structs to force the compiler to generate optimized code using AVX-512 intrinsics. + * + * The SIMD ops contain all AVX-specific code and are defined in the distance headers + * within #if SVS_AVX512_F guards. By instantiating them here with proper compiler flags, + * we ensure optimized machine code is generated and linked into the library. + */ + #if defined(__x86_64__) + +// Include distance headers to get SIMD op definitions #include "svs/core/distance/cosine.h" #include "svs/core/distance/euclidean.h" #include "svs/core/distance/inner_product.h" namespace svs::distance { -// TODO: connect with dim_supported_list -DISTANCE_L2_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_INSTANTIATE_TEMPLATE(128, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_INSTANTIATE_TEMPLATE(160, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_INSTANTIATE_TEMPLATE(200, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX512); -DISTANCE_L2_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX512); - -DISTANCE_IP_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_INSTANTIATE_TEMPLATE(128, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_INSTANTIATE_TEMPLATE(160, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_INSTANTIATE_TEMPLATE(200, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX512); -DISTANCE_IP_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX512); - -DISTANCE_CS_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_INSTANTIATE_TEMPLATE(128, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_INSTANTIATE_TEMPLATE(160, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_INSTANTIATE_TEMPLATE(200, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX512); -DISTANCE_CS_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX512); +///// +///// Inner Product SIMD Ops +///// + +// Instantiate the primary floating-point SIMD op for AVX-512 +template struct IPFloatOp<16, AVX_AVAILABILITY::AVX512>; + +// Instantiate VNNI integer operation for AVX-512 +template struct IPVNNIOp; + +///// +///// L2 (Euclidean) SIMD Ops +///// + +// Instantiate the primary floating-point SIMD op for AVX-512 +template struct L2FloatOp<16, AVX_AVAILABILITY::AVX512>; + +// Instantiate VNNI integer operation for AVX-512 +template struct L2VNNIOp; + +///// +///// Cosine Similarity SIMD Ops +///// + +// Instantiate the floating-point SIMD op for AVX-512 +template struct CosineFloatOp<16, AVX_AVAILABILITY::AVX512>; } // namespace svs::distance -#endif +#endif // defined(__x86_64__) diff --git a/include/svs/multi-arch/x86/preprocessor.h b/include/svs/multi-arch/x86/preprocessor.h deleted file mode 100644 index 4e0cb941..00000000 --- a/include/svs/multi-arch/x86/preprocessor.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2025 Intel Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#define DISTANCE_L2_TEMPLATE_HELPER(SPEC, N, AVX) \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; \ - SPEC struct L2Impl; - -#define DISTANCE_L2_INSTANTIATE_TEMPLATE(N, AVX) \ - DISTANCE_L2_TEMPLATE_HELPER(template, N, AVX); - -#define DISTANCE_L2_EXTERN_TEMPLATE(N, AVX) \ - DISTANCE_L2_TEMPLATE_HELPER(extern template, N, AVX); - -#define DISTANCE_IP_TEMPLATE_HELPER(SPEC, N, AVX) \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; \ - SPEC struct IPImpl; - -#define DISTANCE_IP_INSTANTIATE_TEMPLATE(N, AVX) \ - DISTANCE_IP_TEMPLATE_HELPER(template, N, AVX); - -#define DISTANCE_IP_EXTERN_TEMPLATE(N, AVX) \ - DISTANCE_IP_TEMPLATE_HELPER(extern template, N, AVX); - -#define DISTANCE_CS_TEMPLATE_HELPER(SPEC, N, AVX) \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; \ - SPEC struct CosineSimilarityImpl; - -#define DISTANCE_CS_INSTANTIATE_TEMPLATE(N, AVX) \ - DISTANCE_CS_TEMPLATE_HELPER(template, N, AVX); - -#define DISTANCE_CS_EXTERN_TEMPLATE(N, AVX) \ - DISTANCE_CS_TEMPLATE_HELPER(extern template, N, AVX);