From 7d30e8742f3b2093c9e697b996b6be722c20986b Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Wed, 4 Feb 2026 15:55:08 +0100 Subject: [PATCH 01/11] implement `carryless_mul` --- compiler/rustc_codegen_llvm/src/intrinsic.rs | 19 +++++++ .../rustc_hir_analysis/src/check/intrinsic.rs | 5 +- compiler/rustc_span/src/symbol.rs | 2 + library/core/src/intrinsics/fallback.rs | 54 +++++++++++++++++++ library/core/src/intrinsics/mod.rs | 13 +++++ library/core/src/intrinsics/simd.rs | 12 +++++ library/core/src/lib.rs | 1 + library/core/src/num/mod.rs | 30 +++++++++-- library/core/src/num/uint_macros.rs | 29 ++++++++++ library/coretests/tests/lib.rs | 1 + library/coretests/tests/num/uint_macros.rs | 7 +++ library/std/src/lib.rs | 1 + 12 files changed, 170 insertions(+), 4 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index e035f0809d685..570e9602a60ce 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -396,6 +396,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { | sym::bitreverse | sym::saturating_add | sym::saturating_sub + | sym::carryless_mul | sym::unchecked_funnel_shl | sym::unchecked_funnel_shr => { let ty = args[0].layout.ty; @@ -438,6 +439,11 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { sym::bitreverse => { self.call_intrinsic("llvm.bitreverse", &[llty], &[args[0].immediate()]) } + sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => { + let lhs = args[0].immediate(); + let rhs = args[1].immediate(); + self.call_intrinsic("llvm.clmul", &[llty], &[lhs, rhs]) + } sym::unchecked_funnel_shl | sym::unchecked_funnel_shr => { let is_left = name == sym::unchecked_funnel_shl; let lhs = args[0].immediate(); @@ -2763,6 +2769,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>( | sym::simd_ctlz | sym::simd_ctpop | sym::simd_cttz + | sym::simd_carryless_mul | 
sym::simd_funnel_shl | sym::simd_funnel_shr ) { @@ -2787,6 +2794,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>( sym::simd_cttz => "llvm.cttz", sym::simd_funnel_shl => "llvm.fshl", sym::simd_funnel_shr => "llvm.fshr", + sym::simd_carryless_mul => "llvm.clmul", _ => unreachable!(), }; let int_size = in_elem.int_size_and_signed(bx.tcx()).0.bits(); @@ -2812,6 +2820,17 @@ fn generic_simd_intrinsic<'ll, 'tcx>( &[vec_ty], &[args[0].immediate(), args[1].immediate(), args[2].immediate()], )), + sym::simd_carryless_mul => { + if crate::llvm_util::get_version() >= (22, 0, 0) { + Ok(bx.call_intrinsic( + llvm_intrinsic, + &[vec_ty], + &[args[0].immediate(), args[1].immediate()], + )) + } else { + span_bug!(span, "`simd_carryless_mul` needs LLVM 22 or higher"); + } + } _ => unreachable!(), }; } diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index 22ee490b81a7b..6946d1a70040d 100644 --- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -82,6 +82,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi | sym::bswap | sym::caller_location | sym::carrying_mul_add + | sym::carryless_mul | sym::ceilf16 | sym::ceilf32 | sym::ceilf64 @@ -564,6 +565,7 @@ pub(crate) fn check_intrinsic_type( (1, 0, vec![param(0), param(0)], param(0)) } sym::saturating_add | sym::saturating_sub => (1, 0, vec![param(0), param(0)], param(0)), + sym::carryless_mul => (1, 0, vec![param(0), param(0)], param(0)), sym::fadd_fast | sym::fsub_fast | sym::fmul_fast | sym::fdiv_fast | sym::frem_fast => { (1, 0, vec![param(0), param(0)], param(0)) } @@ -711,7 +713,8 @@ pub(crate) fn check_intrinsic_type( | sym::simd_fmin | sym::simd_fmax | sym::simd_saturating_add - | sym::simd_saturating_sub => (1, 0, vec![param(0), param(0)], param(0)), + | sym::simd_saturating_sub + | sym::simd_carryless_mul => (1, 0, vec![param(0), param(0)], param(0)), sym::simd_arith_offset 
=> (2, 0, vec![param(0), param(1)], param(0)), sym::simd_neg | sym::simd_bswap diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index aac4cf1de8c2b..4ffe813440b82 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -642,6 +642,7 @@ symbols! { caller_location, capture_disjoint_fields, carrying_mul_add, + carryless_mul, catch_unwind, cause, cdylib, @@ -2083,6 +2084,7 @@ symbols! { simd_bitmask, simd_bitreverse, simd_bswap, + simd_carryless_mul, simd_cast, simd_cast_ptr, simd_ceil, diff --git a/library/core/src/intrinsics/fallback.rs b/library/core/src/intrinsics/fallback.rs index 932537f2581f8..a9b610ae2cbc1 100644 --- a/library/core/src/intrinsics/fallback.rs +++ b/library/core/src/intrinsics/fallback.rs @@ -218,3 +218,57 @@ macro_rules! impl_funnel_shifts { impl_funnel_shifts! { u8, u16, u32, u64, u128, usize } + +#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")] +pub const trait CarrylessMul: Copy + 'static { + /// See [`super::carryless_mul`]; we just need the trait indirection to handle + /// different types since calling intrinsics with generics doesn't work. + fn carryless_mul(self, rhs: Self) -> Self; +} + +macro_rules! impl_carryless_mul{ + ($($type:ident),*) => {$( + /// This approach uses a bitmask of the form `0b100010001...0001` to avoid carry spilling. + /// When carries do occur, they wind up in a "hole" of zeros and are subsequently masked + /// out of the result. + #[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")] + impl const CarrylessMul for $type { + #[inline] + fn carryless_mul(self, rhs: Self) -> Self { + use crate::num::Wrapping; + + // i.e. 0b100010001...0001 in binary. 
+ const MASK: u128 = 0x1111_1111_1111_1111_1111_1111_1111_1111; + + let m0 = MASK as $type; + let x = self; + let y = rhs; + + let m1 = m0 << 1; + let m2 = m1 << 1; + let m3 = m2 << 1; + + let x0 = Wrapping(x & m0); + let x1 = Wrapping(x & m1); + let x2 = Wrapping(x & m2); + let x3 = Wrapping(x & m3); + + let y0 = Wrapping(y & m0); + let y1 = Wrapping(y & m1); + let y2 = Wrapping(y & m2); + let y3 = Wrapping(y & m3); + + let z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1); + let z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2); + let z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3); + let z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0); + + (z0.0 & m0) | (z1.0 & m1) | (z2.0 & m2) | (z3.0 & m3) + } + } + )*}; +} + +impl_carryless_mul! { + u8, u16, u32, u64, u128, usize +} diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs index 051dda731881f..1821497640251 100644 --- a/library/core/src/intrinsics/mod.rs +++ b/library/core/src/intrinsics/mod.rs @@ -2179,6 +2179,19 @@ pub const unsafe fn unchecked_funnel_shr( unsafe { a.unchecked_funnel_shr(b, shift) } } +/// Carryless multiply. +/// +/// Safe versions of this intrinsic are available on the integer primitives +/// via the `carryless_mul` method. For example, [`u32::carryless_mul`]. +#[rustc_intrinsic] +#[rustc_nounwind] +#[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")] +#[unstable(feature = "uint_carryless_mul", issue = "152080")] +#[miri::intrinsic_fallback_is_spec] +pub const fn carryless_mul(a: T, b: T) -> T { + a.carryless_mul(b) +} + /// This is an implementation detail of [`crate::ptr::read`] and should /// not be used anywhere else. See its comments for why this exists. 
/// diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs index f70262c38ae50..5fb2102c319e2 100644 --- a/library/core/src/intrinsics/simd.rs +++ b/library/core/src/intrinsics/simd.rs @@ -162,6 +162,18 @@ pub const unsafe fn simd_funnel_shl(a: T, b: T, shift: T) -> T; #[rustc_nounwind] pub const unsafe fn simd_funnel_shr(a: T, b: T, shift: T) -> T; +/// Compute the carry-less product. +/// +/// This is similar to long multiplication except that the carry is discarded. +/// +/// This operation can be used to model multiplication in `GF(2)[X]`, the polynomial +/// ring over `GF(2)`. +/// +/// `T` must be a vector of integers. +#[rustc_intrinsic] +#[rustc_nounwind] +pub unsafe fn simd_carryless_mul(a: T, b: T) -> T; + /// "And"s vectors elementwise. /// /// `T` must be a vector of integers. diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index 432ca50b33613..c1568b58b5fd3 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -188,6 +188,7 @@ #![feature(trait_alias)] #![feature(transparent_unions)] #![feature(try_blocks)] +#![feature(uint_carryless_mul)] #![feature(unboxed_closures)] #![feature(unsized_fn_params)] #![feature(with_negative_coherence)] diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs index 558426c94e5dc..aa3a3e5d3ca41 100644 --- a/library/core/src/num/mod.rs +++ b/library/core/src/num/mod.rs @@ -458,6 +458,9 @@ impl u8 { fsh_op = "0x36", fshl_result = "0x8", fshr_result = "0x8d", + clmul_lhs = "0x12", + clmul_rhs = "0x34", + clmul_result = "0x28", swap_op = "0x12", swapped = "0x12", reversed = "0x48", @@ -1095,6 +1098,9 @@ impl u16 { fsh_op = "0x2de", fshl_result = "0x30", fshr_result = "0x302d", + clmul_lhs = "0x9012", + clmul_rhs = "0xcd34", + clmul_result = "0x928", swap_op = "0x1234", swapped = "0x3412", reversed = "0x2c48", @@ -1145,6 +1151,9 @@ impl u32 { fsh_op = "0x2fe78e45", fshl_result = "0xb32f", fshr_result = "0xb32fe78e", + clmul_lhs = "0x56789012", + 
clmul_rhs = "0xf52ecd34", + clmul_result = "0x9b980928", swap_op = "0x12345678", swapped = "0x78563412", reversed = "0x1e6a2c48", @@ -1171,6 +1180,9 @@ impl u64 { fsh_op = "0x2fe78e45983acd98", fshl_result = "0x6e12fe", fshr_result = "0x6e12fe78e45983ac", + clmul_lhs = "0x7890123456789012", + clmul_rhs = "0xdd358416f52ecd34", + clmul_result = "0xa6299579b980928", swap_op = "0x1234567890123456", swapped = "0x5634129078563412", reversed = "0x6a2c48091e6a2c48", @@ -1197,6 +1209,9 @@ impl u128 { fsh_op = "0x2fe78e45983acd98039000008736273", fshl_result = "0x4f7602fe", fshr_result = "0x4f7602fe78e45983acd9803900000873", + clmul_lhs = "0x12345678901234567890123456789012", + clmul_rhs = "0x4317e40ab4ddcf05dd358416f52ecd34", + clmul_result = "0xb9cf660de35d0c170a6299579b980928", swap_op = "0x12345678901234567890123456789012", swapped = "0x12907856341290785634129078563412", reversed = "0x48091e6a2c48091e6a2c48091e6a2c48", @@ -1223,9 +1238,12 @@ impl usize { rot = 4, rot_op = "0xa003", rot_result = "0x3a", - fsh_op = "0x2fe78e45983acd98039000008736273", - fshl_result = "0x4f7602fe", - fshr_result = "0x4f7602fe78e45983acd9803900000873", + fsh_op = "0x2de", + fshl_result = "0x30", + fshr_result = "0x302d", + clmul_lhs = "0x9012", + clmul_rhs = "0xcd34", + clmul_result = "0x928", swap_op = "0x1234", swapped = "0x3412", reversed = "0x2c48", @@ -1253,6 +1271,9 @@ impl usize { fsh_op = "0x2fe78e45", fshl_result = "0xb32f", fshr_result = "0xb32fe78e", + clmul_lhs = "0x56789012", + clmul_rhs = "0xf52ecd34", + clmul_result = "0x9b980928", swap_op = "0x12345678", swapped = "0x78563412", reversed = "0x1e6a2c48", @@ -1280,6 +1301,9 @@ impl usize { fsh_op = "0x2fe78e45983acd98", fshl_result = "0x6e12fe", fshr_result = "0x6e12fe78e45983ac", + clmul_lhs = "0x7890123456789012", + clmul_rhs = "0xdd358416f52ecd34", + clmul_result = "0xa6299579b980928", swap_op = "0x1234567890123456", swapped = "0x5634129078563412", reversed = "0x6a2c48091e6a2c48", diff --git 
a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs index 5c263ea845cc2..3d045fb913edc 100644 --- a/library/core/src/num/uint_macros.rs +++ b/library/core/src/num/uint_macros.rs @@ -17,6 +17,9 @@ macro_rules! uint_impl { fsh_op = $fsh_op:literal, fshl_result = $fshl_result:literal, fshr_result = $fshr_result:literal, + clmul_lhs = $clmul_lhs:literal, + clmul_rhs = $clmul_rhs:literal, + clmul_result = $clmul_result:literal, swap_op = $swap_op:literal, swapped = $swapped:literal, reversed = $reversed:literal, @@ -482,6 +485,32 @@ macro_rules! uint_impl { unsafe { intrinsics::unchecked_funnel_shr(self, rhs, n) } } + /// Performs a carry-less multiplication. + /// + /// This is similar to long multiplication except that the carry is discarded. + /// This function wraps, so only the low bits are returned. + /// + /// This operation can be used to model multiplication in `GF(2)[X]`, the polynomial + /// ring over `GF(2)`. + /// + /// ``` + /// #![feature(uint_carryless_mul)] + /// + #[doc = concat!("let a = ", $clmul_lhs, stringify!($SelfT), ";")] + #[doc = concat!("let b = ", $clmul_rhs, stringify!($SelfT), ";")] + /// + #[doc = concat!("assert_eq!(a.carryless_mul(b), ", $clmul_result, ");")] + /// ``` + #[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")] + #[doc(alias = "clmul")] + #[unstable(feature = "uint_carryless_mul", issue = "152080")] + #[must_use = "this returns the result of the operation, \ + without modifying the original"] + #[inline(always)] + pub const fn carryless_mul(self, rhs: Self) -> Self { + intrinsics::carryless_mul(self, rhs) + } + /// Reverses the byte order of the integer. 
/// /// # Examples diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs index d085e4ad1a8fe..f3b36ef9092b7 100644 --- a/library/coretests/tests/lib.rs +++ b/library/coretests/tests/lib.rs @@ -120,6 +120,7 @@ #![feature(try_trait_v2)] #![feature(type_info)] #![feature(uint_bit_width)] +#![feature(uint_carryless_mul)] #![feature(uint_gather_scatter_bits)] #![feature(unsize)] #![feature(unwrap_infallible)] diff --git a/library/coretests/tests/num/uint_macros.rs b/library/coretests/tests/num/uint_macros.rs index 7c4fb22599c03..240c66fd5c715 100644 --- a/library/coretests/tests/num/uint_macros.rs +++ b/library/coretests/tests/num/uint_macros.rs @@ -117,6 +117,13 @@ macro_rules! uint_module { assert_eq_const_safe!($T: <$T>::funnel_shr(_1, _1, 4), <$T>::rotate_right(_1, 4)); } + fn test_carryless_mul() { + assert_eq_const_safe!($T: <$T>::carryless_mul(0, 0), 0); + assert_eq_const_safe!($T: <$T>::carryless_mul(1, 1), 1); + + assert_eq_const_safe!($T: <$T>::carryless_mul(0b0100, 2), 0b1000); + } + fn test_swap_bytes() { assert_eq_const_safe!($T: A.swap_bytes().swap_bytes(), A); assert_eq_const_safe!($T: B.swap_bytes().swap_bytes(), B); diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index dcde208fac77b..12fa3efa84803 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -315,6 +315,7 @@ #![feature(try_blocks)] #![feature(try_trait_v2)] #![feature(type_alias_impl_trait)] +#![feature(uint_carryless_mul)] // tidy-alphabetical-end // // Library features (core): From b11c3d58dcc3cc483445656389c855eb581a7e96 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Thu, 5 Feb 2026 22:16:43 +0100 Subject: [PATCH 02/11] use the simple, inefficient implementation in rustc_const_eval --- .../src/interpret/intrinsics.rs | 27 +++++++++++++++++++ library/core/src/intrinsics/mod.rs | 3 ++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/compiler/rustc_const_eval/src/interpret/intrinsics.rs 
b/compiler/rustc_const_eval/src/interpret/intrinsics.rs index e526f6120689a..09922d401657d 100644 --- a/compiler/rustc_const_eval/src/interpret/intrinsics.rs +++ b/compiler/rustc_const_eval/src/interpret/intrinsics.rs @@ -733,6 +733,33 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> { sym::fmuladdf128 => { self.float_muladd_intrinsic::(args, dest, MulAddType::Nondeterministic)? } + sym::carryless_mul => { + let size = dest.layout.size; + + let left = self.read_scalar(&args[0])?.to_bits(size)?; + let right = self.read_scalar(&args[1])?.to_bits(size)?; + + // perform carry-less multiplication. + // + // this operation is like long multiplication, but ignores the carries. + // that idea corresponds to the xor operator, which is used in the implementation. + // + // wikipedia has an example https://en.wikipedia.org/wiki/carry-less_product#example + let mut result: u128 = 0; + + for i in 0..size.bits() { + // if the i-th bit in right is set + if (right >> i) & 1 != 0 { + // xor result with `left` shifted to the left by i positions + result ^= left << i; + } + } + + // Only return the lower bits. + result &= u128::MAX >> (128 - size.bits()); + + self.write_scalar(Scalar::from_uint(result, dest.layout.size), dest)?; + } // Unsupported intrinsic: skip the return_to_block below. 
_ => return interp_ok(false), diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs index 1821497640251..7c6dbfdb7ab70 100644 --- a/library/core/src/intrinsics/mod.rs +++ b/library/core/src/intrinsics/mod.rs @@ -2187,8 +2187,9 @@ pub const unsafe fn unchecked_funnel_shr( #[rustc_nounwind] #[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")] #[unstable(feature = "uint_carryless_mul", issue = "152080")] -#[miri::intrinsic_fallback_is_spec] pub const fn carryless_mul(a: T, b: T) -> T { + // NOTE: while this implementation could serve as the specification, rustc_const_eval + // actually implements a simpler but less efficient variant as the specification. a.carryless_mul(b) } From fc1448f0d574fc015a32041fb92049cacc061a7f Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Thu, 5 Feb 2026 22:17:34 +0100 Subject: [PATCH 03/11] add a custom fallback for u128 (the generic version would run into carries) --- library/core/src/intrinsics/fallback.rs | 57 ++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/library/core/src/intrinsics/fallback.rs b/library/core/src/intrinsics/fallback.rs index a9b610ae2cbc1..4f4e125a0ae4e 100644 --- a/library/core/src/intrinsics/fallback.rs +++ b/library/core/src/intrinsics/fallback.rs @@ -238,7 +238,7 @@ macro_rules! impl_carryless_mul{ use crate::num::Wrapping; // i.e. 0b100010001...0001 in binary. - const MASK: u128 = 0x1111_1111_1111_1111_1111_1111_1111_1111; + const MASK: u64 = 0x1111_1111_1111_1111; let m0 = MASK as $type; let x = self; @@ -270,5 +270,58 @@ macro_rules! impl_carryless_mul{ } impl_carryless_mul! 
{ - u8, u16, u32, u64, u128, usize + u8, u16, u32, u64, usize +} + +#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")] +impl const CarrylessMul for u128 { + #[inline] + fn carryless_mul(self, rhs: Self) -> Self { + // For u128 the 0b100010001...0001 trick above does not work, so we use an implementation + // that uses 64-bit carryless multiplication. + karatsuba_u128(self, rhs).1 + } +} + +#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")] +const fn karatsuba_u128(h: u128, y: u128) -> (u128, u128) { + // Karatsuba input decomposition for H + let (h1, h0) = ((h >> 64) as u64, h as u64); + let h0r = h0.reverse_bits(); + let h1r = h1.reverse_bits(); + let h2 = h0 ^ h1; + let h2r = h0r ^ h1r; + + // Karatsuba input decomposition for Y + let (y1, y0) = ((y >> 64) as u64, y as u64); + let y0r = y0.reverse_bits(); + let y1r = y1.reverse_bits(); + let y2 = y0 ^ y1; + let y2r = y0r ^ y1r; + + // Perform carryless multiplications + let z0 = y0.carryless_mul(h0); + let z1 = y1.carryless_mul(h1); + let mut z2 = y2.carryless_mul(h2); + let mut z0h = y0r.carryless_mul(h0r); + let mut z1h = y1r.carryless_mul(h1r); + let mut z2h = y2r.carryless_mul(h2r); + + // Karatsuba recombination + z2 ^= z0 ^ z1; + z2h ^= z0h ^ z1h; + z0h = z0h.reverse_bits() >> 1; + z1h = z1h.reverse_bits() >> 1; + z2h = z2h.reverse_bits() >> 1; + + // Assemble the final 256-bit product + let v0 = z0; + let v1 = z0h ^ z2; + let v2 = z1 ^ z2h; + let v3 = z1h; + + let high = ((v3 as u128) << 64) | v2 as u128; + let low = ((v1 as u128) << 64) | v0 as u128; + + (high, low) } From 3cef0de57a67dc7401e28407d2f62bf16eec7b78 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Fri, 6 Feb 2026 00:01:07 +0100 Subject: [PATCH 04/11] use mask constants in the generic impl --- library/core/src/intrinsics/fallback.rs | 30 ++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/library/core/src/intrinsics/fallback.rs 
b/library/core/src/intrinsics/fallback.rs index 4f4e125a0ae4e..19be1e5d2a2f3 100644 --- a/library/core/src/intrinsics/fallback.rs +++ b/library/core/src/intrinsics/fallback.rs @@ -238,32 +238,32 @@ macro_rules! impl_carryless_mul{ use crate::num::Wrapping; // i.e. 0b100010001...0001 in binary. - const MASK: u64 = 0x1111_1111_1111_1111; + const MASK: u64 = 0x1111_1111_1111_1111u64; + + const M0: $type = MASK as $type; + const M1: $type = M0 << 1; + const M2: $type = M1 << 1; + const M3: $type = M2 << 1; - let m0 = MASK as $type; let x = self; let y = rhs; - let m1 = m0 << 1; - let m2 = m1 << 1; - let m3 = m2 << 1; - - let x0 = Wrapping(x & m0); - let x1 = Wrapping(x & m1); - let x2 = Wrapping(x & m2); - let x3 = Wrapping(x & m3); + let x0 = Wrapping(x & M0); + let x1 = Wrapping(x & M1); + let x2 = Wrapping(x & M2); + let x3 = Wrapping(x & M3); - let y0 = Wrapping(y & m0); - let y1 = Wrapping(y & m1); - let y2 = Wrapping(y & m2); - let y3 = Wrapping(y & m3); + let y0 = Wrapping(y & M0); + let y1 = Wrapping(y & M1); + let y2 = Wrapping(y & M2); + let y3 = Wrapping(y & M3); let z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1); let z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2); let z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3); let z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0); - (z0.0 & m0) | (z1.0 & m1) | (z2.0 & m2) | (z3.0 & m3) + (z0.0 & M0) | (z1.0 & M1) | (z2.0 & M2) | (z3.0 & M3) } } )*}; From 2220317fe0d5265dedf950f675c68c5b08d33f5a Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Fri, 6 Feb 2026 00:01:15 +0100 Subject: [PATCH 05/11] update docs --- library/core/src/num/uint_macros.rs | 40 +++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs index 3d045fb913edc..30b340cba8fd3 100644 --- a/library/core/src/num/uint_macros.rs +++ b/library/core/src/num/uint_macros.rs @@ -485,13 +485,43 @@ macro_rules! 
uint_impl { unsafe { intrinsics::unchecked_funnel_shr(self, rhs, n) } } - /// Performs a carry-less multiplication. + /// Performs a carry-less multiplication, returning the lower bits. + /// + /// This operation is similar to long multiplication, except that exclusive or is used + /// instead of addition. The implementation is equivalent to: + /// + /// ```no_run + #[doc = concat!("pub fn carryless_mul(lhs: ", stringify!($SelfT), ", rhs: ", stringify!($SelfT), ") -> ", stringify!($SelfT), "{")] + /// let mut retval = 0; + #[doc = concat!(" for i in 0..", stringify!($SelfT), "::BITS {")] + /// if (rhs >> i) & 1 != 0 { + /// // long multiplication would use += + /// retval ^= lhs << i; + /// } + /// } + /// retval + /// } + /// ``` + /// + /// The actual implementation is more efficient, and on some plaforms lowers directly to a + /// dedicated instruction. /// - /// This is similar to long multiplication except that the carry is discarded. - /// This function wraps, so only the low bits are returned. + /// # Uses /// - /// This operation can be used to model multiplication in `GF(2)[X]`, the polynomial - /// ring over `GF(2)`. + /// Carryless multiplication can be used to turn a bitmask of quote characters into a + /// bit mask of characters surrounded by quotes: + /// + /// ```no_run + /// r#"abc xxx "foobar" zzz "a"!"#; // input string + /// 0b0000000010000001000001010; // quote_mask + /// 0b0000000001111110000000100; // quote_mask.carryless_mul(!0) & !quote_mask + /// ``` + /// + /// Another use is in cryptography, where carryless multiplication allows for efficient + /// implementations of polynomial multiplication in `GF(2)[X]`, the polynomial ring + /// over `GF(2)`. 
+ /// + /// # Examples /// /// ``` /// #![feature(uint_carryless_mul)] From ecf4d3fa8bb6de6f7c95d76b5f17cb2424f61467 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Fri, 6 Feb 2026 00:28:38 +0100 Subject: [PATCH 06/11] use the fallback with earlier versions of LLVM --- compiler/rustc_codegen_llvm/src/intrinsic.rs | 27 +++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 570e9602a60ce..0ac068787ff2a 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -387,6 +387,27 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { let pair = self.insert_value(pair, high, 1); pair } + + // FIXME move into the branch below when LLVM 22 is the lowest version we support. + sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => { + let ty = args[0].layout.ty; + if !ty.is_integral() { + tcx.dcx().emit_err(InvalidMonomorphization::BasicIntegerType { + span, + name, + ty, + }); + return Ok(()); + } + let (size, _) = ty.int_size_and_signed(self.tcx); + let width = size.bits(); + let llty = self.type_ix(width); + + let lhs = args[0].immediate(); + let rhs = args[1].immediate(); + self.call_intrinsic("llvm.clmul", &[llty], &[lhs, rhs]) + } + sym::ctlz | sym::ctlz_nonzero | sym::cttz @@ -396,7 +417,6 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { | sym::bitreverse | sym::saturating_add | sym::saturating_sub - | sym::carryless_mul | sym::unchecked_funnel_shl | sym::unchecked_funnel_shr => { let ty = args[0].layout.ty; @@ -439,11 +459,6 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { sym::bitreverse => { self.call_intrinsic("llvm.bitreverse", &[llty], &[args[0].immediate()]) } - sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => { - let lhs = args[0].immediate(); - let 
rhs = args[1].immediate(); - self.call_intrinsic("llvm.clmul", &[llty], &[lhs, rhs]) - } sym::unchecked_funnel_shl | sym::unchecked_funnel_shr => { let is_left = name == sym::unchecked_funnel_shl; let lhs = args[0].immediate(); From 16a0add2b1ca0db1c725e2c18f11e7024fb94113 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Fri, 6 Feb 2026 00:29:23 +0100 Subject: [PATCH 07/11] add `carryless_mul` to `replaced_intrinsics` --- compiler/rustc_codegen_llvm/src/lib.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs index bf3ec1f393302..eea4dfc08b7c7 100644 --- a/compiler/rustc_codegen_llvm/src/lib.rs +++ b/compiler/rustc_codegen_llvm/src/lib.rs @@ -345,7 +345,14 @@ impl CodegenBackend for LlvmCodegenBackend { } fn replaced_intrinsics(&self) -> Vec { - vec![sym::unchecked_funnel_shl, sym::unchecked_funnel_shr, sym::carrying_mul_add] + let mut will_not_use_fallback = + vec![sym::unchecked_funnel_shl, sym::unchecked_funnel_shr, sym::carrying_mul_add]; + + if llvm_util::get_version() >= (22, 0, 0) { + will_not_use_fallback.push(sym::carryless_mul); + } + + will_not_use_fallback } fn codegen_crate<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Box { From fbc7f99d62f493928f5b6bbc22ed4c144d2d16d6 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Fri, 6 Feb 2026 10:57:55 +0100 Subject: [PATCH 08/11] add improved u128 implementation --- library/core/src/intrinsics/fallback.rs | 69 +++++++++++-------------- library/core/src/num/uint_macros.rs | 2 +- 2 files changed, 31 insertions(+), 40 deletions(-) diff --git a/library/core/src/intrinsics/fallback.rs b/library/core/src/intrinsics/fallback.rs index 19be1e5d2a2f3..3244ccb8989be 100644 --- a/library/core/src/intrinsics/fallback.rs +++ b/library/core/src/intrinsics/fallback.rs @@ -277,51 +277,42 @@ impl_carryless_mul! 
{ impl const CarrylessMul for u128 { #[inline] fn carryless_mul(self, rhs: Self) -> Self { - // For u128 the 0b100010001...0001 trick above does not work, so we use an implementation - // that uses 64-bit carryless multiplication. - karatsuba_u128(self, rhs).1 + let l = u64::carryless_mul(self as u64, rhs as u64); + let lh = u64::carryless_mul(self as u64, (rhs >> 64) as u64); + let hl = u64::carryless_mul((self >> 64) as u64, rhs as u64); + let h = lh ^ hl ^ carryless_mul_high(self as u64, rhs as u64); + ((h as u128) << 64) | l as u128 } } #[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")] -const fn karatsuba_u128(h: u128, y: u128) -> (u128, u128) { - // Karatsuba input decomposition for H - let (h1, h0) = ((h >> 64) as u64, h as u64); - let h0r = h0.reverse_bits(); - let h1r = h1.reverse_bits(); - let h2 = h0 ^ h1; - let h2r = h0r ^ h1r; - - // Karatsuba input decomposition for Y - let (y1, y0) = ((y >> 64) as u64, y as u64); - let y0r = y0.reverse_bits(); - let y1r = y1.reverse_bits(); - let y2 = y0 ^ y1; - let y2r = y0r ^ y1r; - - // Perform carryless multiplications - let z0 = y0.carryless_mul(h0); - let z1 = y1.carryless_mul(h1); - let mut z2 = y2.carryless_mul(h2); - let mut z0h = y0r.carryless_mul(h0r); - let mut z1h = y1r.carryless_mul(h1r); - let mut z2h = y2r.carryless_mul(h2r); +#[inline] +const fn carryless_mul_high(x: u64, y: u64) -> u64 { + // i.e. 0b100010001...0001 in binary. + const MASK: u64 = 0x1111_1111_1111_1111u64; - // Karatsuba recombination - z2 ^= z0 ^ z1; - z2h ^= z0h ^ z1h; - z0h = z0h.reverse_bits() >> 1; - z1h = z1h.reverse_bits() >> 1; - z2h = z2h.reverse_bits() >> 1; + const M0: u64 = MASK; + const M1: u64 = M0 << 1; + const M2: u64 = M1 << 1; + const M3: u64 = M2 << 1; - // Assemble the final 256-bit product - let v0 = z0; - let v1 = z0h ^ z2; - let v2 = z1 ^ z2h; - let v3 = z1h; + macro_rules! 
mul { + ($x_mask_shift:literal, $y_mask_shift:literal) => {{ + let x = x & (MASK << $x_mask_shift); + let y = y & (MASK << $y_mask_shift); + crate::hint::select_unpredictable( + x == MASK << $x_mask_shift && y == MASK << $y_mask_shift, + // only case where the multiply overflows the 4-bit parts + 0x0101_0101_0101_0101u64 << ($x_mask_shift + $y_mask_shift), + x.carrying_mul(y, 0).1, + ) + }}; + } - let high = ((v3 as u128) << 64) | v2 as u128; - let low = ((v1 as u128) << 64) | v0 as u128; + let z0 = mul!(0, 0) ^ mul!(1, 3) ^ mul!(2, 2) ^ mul!(3, 1); + let z1 = mul!(0, 1) ^ mul!(1, 0) ^ mul!(2, 3) ^ mul!(3, 2); + let z2 = mul!(0, 2) ^ mul!(1, 1) ^ mul!(2, 0) ^ mul!(3, 3); + let z3 = mul!(0, 3) ^ mul!(1, 2) ^ mul!(2, 1) ^ mul!(3, 0); - (high, low) + (z0 & M0) | (z1 & M1) | (z2 & M2) | (z3 & M3) } diff --git a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs index 30b340cba8fd3..62c0e3e317da4 100644 --- a/library/core/src/num/uint_macros.rs +++ b/library/core/src/num/uint_macros.rs @@ -503,7 +503,7 @@ macro_rules! uint_impl { /// } /// ``` /// - /// The actual implementation is more efficient, and on some plaforms lowers directly to a + /// The actual implementation is more efficient, and on some platforms lowers directly to a /// dedicated instruction. /// /// # Uses From 7404f05e5b173cccce40b19a6b91857f2b5d5e81 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Fri, 6 Feb 2026 16:43:50 +0100 Subject: [PATCH 09/11] add `widening_carryless_mul` --- library/core/src/num/mod.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs index aa3a3e5d3ca41..7a716b79d4a5b 100644 --- a/library/core/src/num/mod.rs +++ b/library/core/src/num/mod.rs @@ -244,6 +244,30 @@ macro_rules! midpoint_impl { }; } +macro_rules! widening_carryless_mul_impl { + ($SelfT:ty, $WideT:ty) => { + /// Performs a widening carry-less multiplication. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(uint_carryless_mul)] + /// + #[doc = concat!("assert_eq!(", stringify!($SelfT), "::MAX.widening_carryless_mul(", + stringify!($SelfT), "::MAX), ", stringify!($WideT), "::MAX / 3);")] + /// ``` + #[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")] + #[doc(alias = "clmul")] + #[unstable(feature = "uint_carryless_mul", issue = "152080")] + #[must_use = "this returns the result of the operation, \ + without modifying the original"] + #[inline] + pub const fn widening_carryless_mul(self, rhs: $SelfT) -> $WideT { + (self as $WideT).carryless_mul(rhs as $WideT) + } + }; +} + impl i8 { int_impl! { Self = i8, @@ -471,6 +495,7 @@ impl u8 { bound_condition = "", } midpoint_impl! { u8, u16, unsigned } + widening_carryless_mul_impl! { u8, u16 } /// Checks if the value is within the ASCII range. /// @@ -1111,6 +1136,7 @@ impl u16 { bound_condition = "", } midpoint_impl! { u16, u32, unsigned } + widening_carryless_mul_impl! { u16, u32 } /// Checks if the value is a Unicode surrogate code point, which are disallowed values for [`char`]. /// @@ -1164,6 +1190,7 @@ impl u32 { bound_condition = "", } midpoint_impl! { u32, u64, unsigned } + widening_carryless_mul_impl! { u32, u64 } } impl u64 { @@ -1193,6 +1220,7 @@ impl u64 { bound_condition = "", } midpoint_impl! { u64, u128, unsigned } + widening_carryless_mul_impl! { u64, u128 } } impl u128 { From 0e2315c1ab8f9400e2ab324d84e136a035b1ded9 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Fri, 6 Feb 2026 21:33:04 +0100 Subject: [PATCH 10/11] add `carrying_carryless_mul` --- library/core/src/num/mod.rs | 97 +++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs index 7a716b79d4a5b..6771130472d62 100644 --- a/library/core/src/num/mod.rs +++ b/library/core/src/num/mod.rs @@ -265,7 +265,96 @@ macro_rules! 
widening_carryless_mul_impl { pub const fn widening_carryless_mul(self, rhs: $SelfT) -> $WideT { (self as $WideT).carryless_mul(rhs as $WideT) } + } +} + +macro_rules! carrying_carryless_mul_impl { + (u128, u256) => { + carrying_carryless_mul_impl! { @internal u128 => + pub const fn carrying_carryless_mul(self, rhs: Self, carry: Self) -> (Self, Self) { + // Karatsuba input decomposition for H + let (h1, h0) = ((self >> 64) as u64, self as u64); + let h0r = h0.reverse_bits(); + let h1r = h1.reverse_bits(); + let h2 = h0 ^ h1; + let h2r = h0r ^ h1r; + + // Karatsuba input decomposition for Y + let (y1, y0) = ((rhs >> 64) as u64, rhs as u64); + let y0r = y0.reverse_bits(); + let y1r = y1.reverse_bits(); + let y2 = y0 ^ y1; + let y2r = y0r ^ y1r; + + // Perform carryless multiplications + let z0 = y0.carryless_mul(h0); + let z1 = y1.carryless_mul(h1); + let mut z2 = y2.carryless_mul(h2); + let mut z0h = y0r.carryless_mul(h0r); + let mut z1h = y1r.carryless_mul(h1r); + let mut z2h = y2r.carryless_mul(h2r); + + // Karatsuba recombination + z2 ^= z0 ^ z1; + z2h ^= z0h ^ z1h; + z0h = z0h.reverse_bits() >> 1; + z1h = z1h.reverse_bits() >> 1; + z2h = z2h.reverse_bits() >> 1; + + // Assemble the final 256-bit product + let v0 = z0; + let v1 = z0h ^ z2; + let v2 = z1 ^ z2h; + let v3 = z1h; + + let hi = ((v3 as u128) << 64) | v2 as u128; + let lo = ((v1 as u128) << 64) | v0 as u128; + + (lo ^ carry, hi) + } + } + }; + ($SelfT:ty, $WideT:ty) => { + carrying_carryless_mul_impl! { @internal $SelfT => + pub const fn carrying_carryless_mul(self, rhs: Self, carry: Self) -> (Self, Self) { + // Can't use widening_carryless_mul because it's not implemented for usize. + let p = (self as $WideT).carryless_mul(rhs as $WideT); + + let lo = (p as $SelfT); + let hi = (p >> Self::BITS) as $SelfT; + + (lo ^ carry, hi) + } + } }; + (@internal $SelfT:ty => $($fn:tt)*) => { + /// Calculates the "full carryless multiplication" without the possibility to overflow. 
+ /// + /// This returns the low-order (wrapping) bits and the high-order (overflow) bits + /// of the result as two separate values, in that order. + /// + /// # Examples + /// + /// Please note that this example is shared among integer types, which is why `u8` is used. + /// + /// ``` + /// #![feature(uint_carryless_mul)] + /// + /// assert_eq!(0b1000_0000u8.carrying_carryless_mul(0b1000_0000, 0b0000), (0, 0b0100_0000)); + /// assert_eq!(0b1000_0000u8.carrying_carryless_mul(0b1000_0000, 0b1111), (0b1111, 0b0100_0000)); + #[doc = concat!("assert_eq!(", + stringify!($SelfT), "::MAX.carrying_carryless_mul(", stringify!($SelfT), "::MAX, ", stringify!($SelfT), "::MAX), ", + "(!(", stringify!($SelfT), "::MAX / 3), ", stringify!($SelfT), "::MAX / 3));" + )] + /// ``` + #[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")] + #[doc(alias = "clmul")] + #[unstable(feature = "uint_carryless_mul", issue = "152080")] + #[must_use = "this returns the result of the operation, \ + without modifying the original"] + #[inline] + $($fn)* + } } impl i8 { @@ -496,6 +585,7 @@ impl u8 { } midpoint_impl! { u8, u16, unsigned } widening_carryless_mul_impl! { u8, u16 } + carrying_carryless_mul_impl! { u8, u16 } /// Checks if the value is within the ASCII range. /// @@ -1137,6 +1227,7 @@ impl u16 { } midpoint_impl! { u16, u32, unsigned } widening_carryless_mul_impl! { u16, u32 } + carrying_carryless_mul_impl! { u16, u32 } /// Checks if the value is a Unicode surrogate code point, which are disallowed values for [`char`]. /// @@ -1191,6 +1282,7 @@ impl u32 { } midpoint_impl! { u32, u64, unsigned } widening_carryless_mul_impl! { u32, u64 } + carrying_carryless_mul_impl! { u32, u64 } } impl u64 { @@ -1221,6 +1313,7 @@ impl u64 { } midpoint_impl! { u64, u128, unsigned } widening_carryless_mul_impl! { u64, u128 } + carrying_carryless_mul_impl! { u64, u128 } } impl u128 { @@ -1252,6 +1345,7 @@ impl u128 { bound_condition = "", } midpoint_impl! 
{ u128, unsigned } + carrying_carryless_mul_impl! { u128, u256 } } #[cfg(target_pointer_width = "16")] @@ -1282,6 +1376,7 @@ impl usize { bound_condition = " on 16-bit targets", } midpoint_impl! { usize, u32, unsigned } + carrying_carryless_mul_impl! { usize, u32 } } #[cfg(target_pointer_width = "32")] @@ -1312,6 +1407,7 @@ impl usize { bound_condition = " on 32-bit targets", } midpoint_impl! { usize, u64, unsigned } + carrying_carryless_mul_impl! { usize, u64 } } #[cfg(target_pointer_width = "64")] @@ -1342,6 +1438,7 @@ impl usize { bound_condition = " on 64-bit targets", } midpoint_impl! { usize, u128, unsigned } + carrying_carryless_mul_impl! { usize, u128 } } impl usize { From 59d6fa7938a7b9f8efe588e47520c54ec2ce20f4 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 8 Feb 2026 11:48:57 +0100 Subject: [PATCH 11/11] improve 128-bit `carrying_carryless_mul` --- library/core/src/num/mod.rs | 59 ++++++++++++++----------------------- 1 file changed, 22 insertions(+), 37 deletions(-) diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs index 6771130472d62..839a6fbdc9b7e 100644 --- a/library/core/src/num/mod.rs +++ b/library/core/src/num/mod.rs @@ -272,43 +272,28 @@ macro_rules! carrying_carryless_mul_impl { (u128, u256) => { carrying_carryless_mul_impl! 
{ @internal u128 => pub const fn carrying_carryless_mul(self, rhs: Self, carry: Self) -> (Self, Self) { - // Karatsuba input decomposition for H - let (h1, h0) = ((self >> 64) as u64, self as u64); - let h0r = h0.reverse_bits(); - let h1r = h1.reverse_bits(); - let h2 = h0 ^ h1; - let h2r = h0r ^ h1r; - - // Karatsuba input decomposition for Y - let (y1, y0) = ((rhs >> 64) as u64, rhs as u64); - let y0r = y0.reverse_bits(); - let y1r = y1.reverse_bits(); - let y2 = y0 ^ y1; - let y2r = y0r ^ y1r; - - // Perform carryless multiplications - let z0 = y0.carryless_mul(h0); - let z1 = y1.carryless_mul(h1); - let mut z2 = y2.carryless_mul(h2); - let mut z0h = y0r.carryless_mul(h0r); - let mut z1h = y1r.carryless_mul(h1r); - let mut z2h = y2r.carryless_mul(h2r); - - // Karatsuba recombination - z2 ^= z0 ^ z1; - z2h ^= z0h ^ z1h; - z0h = z0h.reverse_bits() >> 1; - z1h = z1h.reverse_bits() >> 1; - z2h = z2h.reverse_bits() >> 1; - - // Assemble the final 256-bit product - let v0 = z0; - let v1 = z0h ^ z2; - let v2 = z1 ^ z2h; - let v3 = z1h; - - let hi = ((v3 as u128) << 64) | v2 as u128; - let lo = ((v1 as u128) << 64) | v0 as u128; + let x0 = self as u64; + let x1 = (self >> 64) as u64; + let y0 = rhs as u64; + let y1 = (rhs >> 64) as u64; + + let z0 = u64::widening_carryless_mul(x0, y0); + let z2 = u64::widening_carryless_mul(x1, y1); + + // The grade school algorithm would compute: + // z1 = x0y1 ^ x1y0 + + // Instead, Karatsuba first computes: + let z3 = u64::widening_carryless_mul(x0 ^ x1, y0 ^ y1); + // Since it distributes over XOR, + // z3 == x0y0 ^ x0y1 ^ x1y0 ^ x1y1 + // |--| |---------| |--| + // == z0 ^ z1 ^ z2 + // so we can compute z1 as + let z1 = z3 ^ z0 ^ z2; + + let lo = z0 ^ (z1 << 64); + let hi = z2 ^ (z1 >> 64); (lo ^ carry, hi) }