From e660cb685caa25fcbf69db08945382f0599da066 Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Wed, 31 Jul 2024 10:19:44 -0400 Subject: [PATCH] dp4a for CC < 610 --- .vscode/settings.json | 5 ++++- candle-kernels/src/quantized.cu | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index b2dbd68012..f61bfe66ff 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,5 +7,8 @@ "candle-pyo3" ], "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true + "python.testing.pytestEnabled": true, + "files.associations": { + "cstdint": "cpp" + } } \ No newline at end of file diff --git a/candle-kernels/src/quantized.cu b/candle-kernels/src/quantized.cu index 05f878f3d6..9866add012 100644 --- a/candle-kernels/src/quantized.cu +++ b/candle-kernels/src/quantized.cu @@ -82,6 +82,19 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * #define CC_RDNA2 (CC_OFFSET_AMD + 1030) #define CC_RDNA3 (CC_OFFSET_AMD + 1100) +#if __CUDA_ARCH__ < MIN_CC_DP4A +// Manually perform 8-bit dot product +__device__ int dp4a(int a, int b, int c) { + int result = c; + for (int i = 0; i < 4; ++i) { + int8_t a_byte = (a >> (i * 8)) & 0xFF; + int8_t b_byte = (b >> (i * 8)) & 0xFF; + result += a_byte * b_byte; + } + return result; +} +#endif + #define MMQ_X_Q4_0_RDNA2 64 #define MMQ_Y_Q4_0_RDNA2 128 #define NWARPS_Q4_0_RDNA2 8