Skip to content

Commit 2be3c52

Browse files
Copilot and ahuber21 committed
Add comprehensive documentation to instantiation system
Co-authored-by: ahuber21 <[email protected]>
1 parent 0ecb060 commit 2be3c52

File tree

3 files changed

+149
-0
lines changed

3 files changed

+149
-0
lines changed

include/svs/multi-arch/x86/avx2.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,46 @@
1414
* limitations under the License.
1515
*/
1616

17+
///
18+
/// @file avx2.cpp
19+
/// @brief Explicit instantiations of distance implementations for AVX2
20+
///
21+
/// This file contains explicit template instantiations for distance computations
22+
/// optimized for Intel(R) AVX2 instructions. It is compiled with compiler flags
23+
/// targeting the Haswell microarchitecture (`-march=haswell`), which includes
24+
/// AVX2, FMA, and related extensions.
25+
///
26+
/// ## Purpose
27+
///
28+
/// By explicitly instantiating templates in this compilation unit with AVX2
29+
/// compiler flags, we enable runtime ISA dispatch: the library can detect at runtime
30+
/// whether the CPU supports AVX2 (but not AVX-512) and call these optimized
31+
/// implementations, while falling back to generic implementations if neither AVX-512
32+
/// nor AVX2 is available.
33+
///
34+
/// ## Architecture
35+
///
36+
/// Each distance implementation (L2Impl, IPImpl, CosineSimilarityImpl) is a thin
37+
/// wrapper around `generic_simd_op` which in turn uses SIMD operation structs like:
38+
/// - `L2FloatOp<8>` - L2 distance using AVX2 floating-point operations (SIMD width 8)
39+
/// - `IPFloatOp<8>` - Inner product using AVX2 floating-point operations
40+
/// - `CosineFloatOp<8>` - Cosine similarity using AVX2 floating-point operations
41+
///
42+
/// Note: AVX2 uses SIMD width 8 (256-bit vectors) vs AVX-512's width 16 (512-bit vectors).
43+
///
44+
/// These SIMD ops are defined in the distance headers and contain the actual AVX2
45+
/// intrinsics. The instantiations here ensure this AVX2 code is generated.
46+
///
47+
/// ## Dimensions Instantiated
48+
///
49+
/// We instantiate for the following dimensionalities:
50+
/// - Fixed: 64, 96, 100, 128, 160, 200, 512, 768
51+
/// - Dynamic: For runtime-determined dimensions
52+
///
53+
/// For each dimension, all 16 pairings (4 × 4) of these element types are instantiated:
54+
/// float, int8_t, uint8_t, Float16
55+
///
56+
1757
#if defined(__x86_64__)
1858
#include "svs/core/distance/cosine.h"
1959
#include "svs/core/distance/euclidean.h"
@@ -22,6 +62,10 @@
2262
namespace svs::distance {
2363

2464
// TODO: connect with dim_supported_list
65+
66+
// ============================================================================
67+
// L2 (Euclidean) Distance Instantiations
68+
// ============================================================================
2569
DISTANCE_L2_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX2);
2670
DISTANCE_L2_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX2);
2771
DISTANCE_L2_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX2);
@@ -32,6 +76,9 @@ DISTANCE_L2_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX2);
3276
DISTANCE_L2_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX2);
3377
DISTANCE_L2_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX2);
3478

79+
// ============================================================================
80+
// Inner Product Instantiations
81+
// ============================================================================
3582
DISTANCE_IP_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX2);
3683
DISTANCE_IP_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX2);
3784
DISTANCE_IP_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX2);
@@ -42,6 +89,9 @@ DISTANCE_IP_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX2);
4289
DISTANCE_IP_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX2);
4390
DISTANCE_IP_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX2);
4491

92+
// ============================================================================
93+
// Cosine Similarity Instantiations
94+
// ============================================================================
4595
DISTANCE_CS_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX2);
4696
DISTANCE_CS_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX2);
4797
DISTANCE_CS_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX2);

include/svs/multi-arch/x86/avx512.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,44 @@
1414
* limitations under the License.
1515
*/
1616

17+
///
18+
/// @file avx512.cpp
19+
/// @brief Explicit instantiations of distance implementations for AVX-512
20+
///
21+
/// This file contains explicit template instantiations for distance computations
22+
/// optimized for Intel(R) AVX-512 instructions. It is compiled with compiler flags
23+
/// targeting the Cascade Lake microarchitecture (`-march=cascadelake`), which
24+
/// includes AVX-512F, AVX-512DQ, AVX-512CD, AVX-512BW, AVX-512VL, and AVX-512 VNNI.
25+
///
26+
/// ## Purpose
27+
///
28+
/// By explicitly instantiating templates in this compilation unit with AVX-512
29+
/// compiler flags, we enable runtime ISA dispatch: the library can detect at runtime
30+
/// whether the CPU supports AVX-512 and call these optimized implementations if
31+
/// available, while falling back to AVX2 or generic implementations otherwise.
32+
///
33+
/// ## Architecture
34+
///
35+
/// Each distance implementation (L2Impl, IPImpl, CosineSimilarityImpl) is a thin
36+
/// wrapper around `generic_simd_op` which in turn uses SIMD operation structs like:
37+
/// - `L2FloatOp<16>` - L2 distance using AVX-512 floating-point operations
38+
/// - `IPFloatOp<16>` - Inner product using AVX-512 floating-point operations
39+
/// - `L2VNNIOp<int16_t, 32>` - L2 using AVX-512 VNNI (integer operations)
40+
/// - `IPVNNIOp<int16_t, 32>` - Inner product using AVX-512 VNNI
41+
///
42+
/// These SIMD ops are defined in the distance headers and contain the actual AVX-512
43+
/// intrinsics. The instantiations here ensure this AVX-512 code is generated.
44+
///
45+
/// ## Dimensions Instantiated
46+
///
47+
/// We instantiate for the following dimensionalities:
48+
/// - Fixed: 64, 96, 100, 128, 160, 200, 512, 768
49+
/// - Dynamic: For runtime-determined dimensions
50+
///
51+
/// For each dimension, all 16 pairings (4 × 4) of these element types are instantiated:
52+
/// float, int8_t, uint8_t, Float16
53+
///
54+
1755
#if defined(__x86_64__)
1856
#include "svs/core/distance/cosine.h"
1957
#include "svs/core/distance/euclidean.h"
@@ -22,6 +60,10 @@
2260
namespace svs::distance {
2361

2462
// TODO: connect with dim_supported_list
63+
64+
// ============================================================================
65+
// L2 (Euclidean) Distance Instantiations
66+
// ============================================================================
2567
DISTANCE_L2_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX512);
2668
DISTANCE_L2_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX512);
2769
DISTANCE_L2_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX512);
@@ -32,6 +74,9 @@ DISTANCE_L2_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX512);
3274
DISTANCE_L2_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX512);
3375
DISTANCE_L2_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX512);
3476

77+
// ============================================================================
78+
// Inner Product Instantiations
79+
// ============================================================================
3580
DISTANCE_IP_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX512);
3681
DISTANCE_IP_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX512);
3782
DISTANCE_IP_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX512);
@@ -42,6 +87,9 @@ DISTANCE_IP_INSTANTIATE_TEMPLATE(512, AVX_AVAILABILITY::AVX512);
4287
DISTANCE_IP_INSTANTIATE_TEMPLATE(768, AVX_AVAILABILITY::AVX512);
4388
DISTANCE_IP_INSTANTIATE_TEMPLATE(Dynamic, AVX_AVAILABILITY::AVX512);
4489

90+
// ============================================================================
91+
// Cosine Similarity Instantiations
92+
// ============================================================================
4593
DISTANCE_CS_INSTANTIATE_TEMPLATE(64, AVX_AVAILABILITY::AVX512);
4694
DISTANCE_CS_INSTANTIATE_TEMPLATE(96, AVX_AVAILABILITY::AVX512);
4795
DISTANCE_CS_INSTANTIATE_TEMPLATE(100, AVX_AVAILABILITY::AVX512);

include/svs/multi-arch/x86/preprocessor.h

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,43 @@
1616

1717
#pragma once
1818

19+
///
20+
/// @file preprocessor.h
21+
/// @brief Macros for explicit instantiation of distance implementations
22+
///
23+
/// This file contains macros to systematically generate explicit template instantiations
24+
/// for distance implementations (L2Impl, IPImpl, CosineSimilarityImpl).
25+
///
26+
/// ## Why Explicit Instantiation?
27+
///
28+
/// The library supports runtime ISA dispatch - detecting AVX512/AVX2 support at runtime
29+
/// and calling the appropriate optimized implementation. This requires:
30+
/// 1. Separate compilation with architecture-specific compiler flags
31+
/// 2. Explicit instantiation of templates in those compilation units
32+
///
33+
/// Without explicit instantiation, the templates would be implicitly instantiated wherever
34+
/// used, which would prevent proper ISA-specific optimization.
35+
///
36+
/// ## Architecture
37+
///
38+
/// Distance implementations (e.g., `L2Impl`) are thin wrappers that call `generic_simd_op`
39+
/// with a SIMD operation struct (e.g., `L2FloatOp<16>`). The SIMD ops contain the actual
40+
/// AVX intrinsics. By explicitly instantiating the distance implementations in files
41+
/// compiled with `-march=cascadelake` or `-march=haswell`, we ensure the AVX code is
42+
/// generated with appropriate optimizations.
43+
///
44+
/// ## Type Combinations
45+
///
46+
/// Each macro instantiates 16 type combinations (4 element types × 4 element types):
47+
/// - float, int8_t, uint8_t, Float16
48+
///
49+
/// This covers all supported mixed-type distance computations.
50+
///
51+
52+
/// Helper macro for L2 distance explicit instantiation
53+
/// @param SPEC Either `template` (for definitions) or `extern template` (for declarations)
54+
/// @param N Dimensionality (e.g., 64, 128, Dynamic)
55+
/// @param AVX AVX availability level (AVX_AVAILABILITY::AVX512 or AVX_AVAILABILITY::AVX2)
1956
#define DISTANCE_L2_TEMPLATE_HELPER(SPEC, N, AVX) \
2057
SPEC struct L2Impl<N, float, float, AVX>; \
2158
SPEC struct L2Impl<N, float, int8_t, AVX>; \
@@ -34,12 +71,18 @@
3471
SPEC struct L2Impl<N, svs::float16::Float16, uint8_t, AVX>; \
3572
SPEC struct L2Impl<N, svs::float16::Float16, svs::float16::Float16, AVX>;
3673

74+
/// Instantiate L2 distance implementations (use in .cpp files)
3775
#define DISTANCE_L2_INSTANTIATE_TEMPLATE(N, AVX) \
3876
DISTANCE_L2_TEMPLATE_HELPER(template, N, AVX);
3977

78+
/// Declare external L2 distance implementations (use in .h files)
4079
#define DISTANCE_L2_EXTERN_TEMPLATE(N, AVX) \
4180
DISTANCE_L2_TEMPLATE_HELPER(extern template, N, AVX);
4281

82+
/// Helper macro for Inner Product explicit instantiation
83+
/// @param SPEC Either `template` (for definitions) or `extern template` (for declarations)
84+
/// @param N Dimensionality (e.g., 64, 128, Dynamic)
85+
/// @param AVX AVX availability level (AVX_AVAILABILITY::AVX512 or AVX_AVAILABILITY::AVX2)
4386
#define DISTANCE_IP_TEMPLATE_HELPER(SPEC, N, AVX) \
4487
SPEC struct IPImpl<N, float, float, AVX>; \
4588
SPEC struct IPImpl<N, float, int8_t, AVX>; \
@@ -58,12 +101,18 @@
58101
SPEC struct IPImpl<N, svs::float16::Float16, uint8_t, AVX>; \
59102
SPEC struct IPImpl<N, svs::float16::Float16, svs::float16::Float16, AVX>;
60103

104+
/// Instantiate Inner Product implementations (use in .cpp files)
61105
#define DISTANCE_IP_INSTANTIATE_TEMPLATE(N, AVX) \
62106
DISTANCE_IP_TEMPLATE_HELPER(template, N, AVX);
63107

108+
/// Declare external Inner Product implementations (use in .h files)
64109
#define DISTANCE_IP_EXTERN_TEMPLATE(N, AVX) \
65110
DISTANCE_IP_TEMPLATE_HELPER(extern template, N, AVX);
66111

112+
/// Helper macro for Cosine Similarity explicit instantiation
113+
/// @param SPEC Either `template` (for definitions) or `extern template` (for declarations)
114+
/// @param N Dimensionality (e.g., 64, 128, Dynamic)
115+
/// @param AVX AVX availability level (AVX_AVAILABILITY::AVX512 or AVX_AVAILABILITY::AVX2)
67116
#define DISTANCE_CS_TEMPLATE_HELPER(SPEC, N, AVX) \
68117
SPEC struct CosineSimilarityImpl<N, float, float, AVX>; \
69118
SPEC struct CosineSimilarityImpl<N, float, int8_t, AVX>; \
@@ -82,8 +131,10 @@
82131
SPEC struct CosineSimilarityImpl<N, svs::float16::Float16, uint8_t, AVX>; \
83132
SPEC struct CosineSimilarityImpl<N, svs::float16::Float16, svs::float16::Float16, AVX>;
84133

134+
/// Instantiate Cosine Similarity implementations (use in .cpp files)
85135
#define DISTANCE_CS_INSTANTIATE_TEMPLATE(N, AVX) \
86136
DISTANCE_CS_TEMPLATE_HELPER(template, N, AVX);
87137

138+
/// Declare external Cosine Similarity implementations (use in .h files)
88139
#define DISTANCE_CS_EXTERN_TEMPLATE(N, AVX) \
89140
DISTANCE_CS_TEMPLATE_HELPER(extern template, N, AVX);

0 commit comments

Comments
 (0)