From 7d30e8742f3b2093c9e697b996b6be722c20986b Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Wed, 4 Feb 2026 15:55:08 +0100
Subject: [PATCH 01/11] implement `carryless_mul`

---
 compiler/rustc_codegen_llvm/src/intrinsic.rs  | 19 +++++++
 .../rustc_hir_analysis/src/check/intrinsic.rs |  5 +-
 compiler/rustc_span/src/symbol.rs             |  2 +
 library/core/src/intrinsics/fallback.rs       | 54 +++++++++++++++++++
 library/core/src/intrinsics/mod.rs            | 13 +++++
 library/core/src/intrinsics/simd.rs           | 12 +++++
 library/core/src/lib.rs                       |  1 +
 library/core/src/num/mod.rs                   | 30 +++++++++--
 library/core/src/num/uint_macros.rs           | 29 ++++++++++
 library/coretests/tests/lib.rs                |  1 +
 library/coretests/tests/num/uint_macros.rs    |  7 +++
 library/std/src/lib.rs                        |  1 +
 12 files changed, 170 insertions(+), 4 deletions(-)

diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index e035f0809d685..570e9602a60ce 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -396,6 +396,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
             | sym::bitreverse
             | sym::saturating_add
             | sym::saturating_sub
+            | sym::carryless_mul
             | sym::unchecked_funnel_shl
             | sym::unchecked_funnel_shr => {
                 let ty = args[0].layout.ty;
@@ -438,6 +439,11 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                     sym::bitreverse => {
                         self.call_intrinsic("llvm.bitreverse", &[llty], &[args[0].immediate()])
                     }
+                    sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => {
+                        let lhs = args[0].immediate();
+                        let rhs = args[1].immediate();
+                        self.call_intrinsic("llvm.clmul", &[llty], &[lhs, rhs])
+                    }
                     sym::unchecked_funnel_shl | sym::unchecked_funnel_shr => {
                         let is_left = name == sym::unchecked_funnel_shl;
                         let lhs = args[0].immediate();
@@ -2763,6 +2769,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
             | sym::simd_ctlz
             | sym::simd_ctpop
             | sym::simd_cttz
+            | sym::simd_carryless_mul
             | sym::simd_funnel_shl
             | sym::simd_funnel_shr
     ) {
@@ -2787,6 +2794,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
             sym::simd_cttz => "llvm.cttz",
             sym::simd_funnel_shl => "llvm.fshl",
             sym::simd_funnel_shr => "llvm.fshr",
+            sym::simd_carryless_mul => "llvm.clmul",
             _ => unreachable!(),
         };
         let int_size = in_elem.int_size_and_signed(bx.tcx()).0.bits();
@@ -2812,6 +2820,17 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
                 &[vec_ty],
                 &[args[0].immediate(), args[1].immediate(), args[2].immediate()],
             )),
+            sym::simd_carryless_mul => {
+                if crate::llvm_util::get_version() >= (22, 0, 0) {
+                    Ok(bx.call_intrinsic(
+                        llvm_intrinsic,
+                        &[vec_ty],
+                        &[args[0].immediate(), args[1].immediate()],
+                    ))
+                } else {
+                    span_bug!(span, "`simd_carryless_mul` needs LLVM 22 or higher");
+                }
+            }
             _ => unreachable!(),
         };
     }
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index 22ee490b81a7b..6946d1a70040d 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -82,6 +82,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
         | sym::bswap
         | sym::caller_location
         | sym::carrying_mul_add
+        | sym::carryless_mul
         | sym::ceilf16
         | sym::ceilf32
         | sym::ceilf64
@@ -564,6 +565,7 @@ pub(crate) fn check_intrinsic_type(
             (1, 0, vec![param(0), param(0)], param(0))
         }
         sym::saturating_add | sym::saturating_sub => (1, 0, vec![param(0), param(0)], param(0)),
+        sym::carryless_mul => (1, 0, vec![param(0), param(0)], param(0)),
         sym::fadd_fast | sym::fsub_fast | sym::fmul_fast | sym::fdiv_fast | sym::frem_fast => {
             (1, 0, vec![param(0), param(0)], param(0))
         }
@@ -711,7 +713,8 @@ pub(crate) fn check_intrinsic_type(
         | sym::simd_fmin
         | sym::simd_fmax
         | sym::simd_saturating_add
-        | sym::simd_saturating_sub => (1, 0, vec![param(0), param(0)], param(0)),
+        | sym::simd_saturating_sub
+        | sym::simd_carryless_mul => (1, 0, vec![param(0), param(0)], param(0)),
         sym::simd_arith_offset => (2, 0, vec![param(0), param(1)], param(0)),
         sym::simd_neg
         | sym::simd_bswap
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index aac4cf1de8c2b..4ffe813440b82 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -642,6 +642,7 @@ symbols! {
         caller_location,
         capture_disjoint_fields,
         carrying_mul_add,
+        carryless_mul,
         catch_unwind,
         cause,
         cdylib,
@@ -2083,6 +2084,7 @@ symbols! {
         simd_bitmask,
         simd_bitreverse,
         simd_bswap,
+        simd_carryless_mul,
         simd_cast,
         simd_cast_ptr,
         simd_ceil,
diff --git a/library/core/src/intrinsics/fallback.rs b/library/core/src/intrinsics/fallback.rs
index 932537f2581f8..a9b610ae2cbc1 100644
--- a/library/core/src/intrinsics/fallback.rs
+++ b/library/core/src/intrinsics/fallback.rs
@@ -218,3 +218,57 @@ macro_rules! impl_funnel_shifts {
 impl_funnel_shifts! {
     u8, u16, u32, u64, u128, usize
 }
+
+#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")]
+pub const trait CarrylessMul: Copy + 'static {
+    /// See [`super::carryless_mul`]; we just need the trait indirection to handle
+    /// different types since calling intrinsics with generics doesn't work.
+    fn carryless_mul(self, rhs: Self) -> Self;
+}
+
+macro_rules! impl_carryless_mul{
+    ($($type:ident),*) => {$(
+        /// This approach uses a bitmask of the form `0b100010001...0001` to avoid carry spilling.
+        /// When carries do occur, they wind up in a "hole" of zeros and are subsequently masked
+        /// out of the result.
+        #[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")]
+        impl const CarrylessMul for $type {
+            #[inline]
+            fn carryless_mul(self, rhs: Self) -> Self {
+                use crate::num::Wrapping;
+
+                // i.e. 0b100010001...0001 in binary.
+                const MASK: u128 = 0x1111_1111_1111_1111_1111_1111_1111_1111;
+
+                let m0 = MASK as $type;
+                let x = self;
+                let y = rhs;
+
+                let m1 = m0 << 1;
+                let m2 = m1 << 1;
+                let m3 = m2 << 1;
+
+                let x0 = Wrapping(x & m0);
+                let x1 = Wrapping(x & m1);
+                let x2 = Wrapping(x & m2);
+                let x3 = Wrapping(x & m3);
+
+                let y0 = Wrapping(y & m0);
+                let y1 = Wrapping(y & m1);
+                let y2 = Wrapping(y & m2);
+                let y3 = Wrapping(y & m3);
+
+                let z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1);
+                let z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2);
+                let z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3);
+                let z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0);
+
+                (z0.0 & m0) | (z1.0 & m1) | (z2.0 & m2) | (z3.0 & m3)
+            }
+        }
+    )*};
+}
+
+impl_carryless_mul! {
+    u8, u16, u32, u64, u128, usize
+}
diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs
index 051dda731881f..1821497640251 100644
--- a/library/core/src/intrinsics/mod.rs
+++ b/library/core/src/intrinsics/mod.rs
@@ -2179,6 +2179,19 @@ pub const unsafe fn unchecked_funnel_shr<T: [const] fallback::FunnelShift>(
     unsafe { a.unchecked_funnel_shr(b, shift) }
 }
 
+/// Carryless multiply.
+///
+/// Safe versions of this intrinsic are available on the integer primitives
+/// via the `carryless_mul` method. For example, [`u32::carryless_mul`].
+#[rustc_intrinsic]
+#[rustc_nounwind]
+#[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")]
+#[unstable(feature = "uint_carryless_mul", issue = "152080")]
+#[miri::intrinsic_fallback_is_spec]
+pub const fn carryless_mul<T: [const] fallback::CarrylessMul>(a: T, b: T) -> T {
+    a.carryless_mul(b)
+}
+
 /// This is an implementation detail of [`crate::ptr::read`] and should
 /// not be used anywhere else.  See its comments for why this exists.
 ///
diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
index f70262c38ae50..5fb2102c319e2 100644
--- a/library/core/src/intrinsics/simd.rs
+++ b/library/core/src/intrinsics/simd.rs
@@ -162,6 +162,18 @@ pub const unsafe fn simd_funnel_shl<T>(a: T, b: T, shift: T) -> T;
 #[rustc_nounwind]
 pub const unsafe fn simd_funnel_shr<T>(a: T, b: T, shift: T) -> T;
 
+/// Compute the carry-less product.
+///
+/// This is similar to long multiplication except that the carry is discarded.
+///
+/// This operation can be used to model multiplication in `GF(2)[X]`, the polynomial
+/// ring over `GF(2)`.
+///
+/// `T` must be a vector of integers.
+#[rustc_intrinsic]
+#[rustc_nounwind]
+pub unsafe fn simd_carryless_mul<T>(a: T, b: T) -> T;
+
 /// "And"s vectors elementwise.
 ///
 /// `T` must be a vector of integers.
diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs
index 432ca50b33613..c1568b58b5fd3 100644
--- a/library/core/src/lib.rs
+++ b/library/core/src/lib.rs
@@ -188,6 +188,7 @@
 #![feature(trait_alias)]
 #![feature(transparent_unions)]
 #![feature(try_blocks)]
+#![feature(uint_carryless_mul)]
 #![feature(unboxed_closures)]
 #![feature(unsized_fn_params)]
 #![feature(with_negative_coherence)]
diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs
index 558426c94e5dc..aa3a3e5d3ca41 100644
--- a/library/core/src/num/mod.rs
+++ b/library/core/src/num/mod.rs
@@ -458,6 +458,9 @@ impl u8 {
         fsh_op = "0x36",
         fshl_result = "0x8",
         fshr_result = "0x8d",
+        clmul_lhs = "0x12",
+        clmul_rhs = "0x34",
+        clmul_result = "0x28",
         swap_op = "0x12",
         swapped = "0x12",
         reversed = "0x48",
@@ -1095,6 +1098,9 @@ impl u16 {
         fsh_op = "0x2de",
         fshl_result = "0x30",
         fshr_result = "0x302d",
+        clmul_lhs = "0x9012",
+        clmul_rhs = "0xcd34",
+        clmul_result = "0x928",
         swap_op = "0x1234",
         swapped = "0x3412",
         reversed = "0x2c48",
@@ -1145,6 +1151,9 @@ impl u32 {
         fsh_op = "0x2fe78e45",
         fshl_result = "0xb32f",
         fshr_result = "0xb32fe78e",
+        clmul_lhs = "0x56789012",
+        clmul_rhs = "0xf52ecd34",
+        clmul_result = "0x9b980928",
         swap_op = "0x12345678",
         swapped = "0x78563412",
         reversed = "0x1e6a2c48",
@@ -1171,6 +1180,9 @@ impl u64 {
         fsh_op = "0x2fe78e45983acd98",
         fshl_result = "0x6e12fe",
         fshr_result = "0x6e12fe78e45983ac",
+        clmul_lhs = "0x7890123456789012",
+        clmul_rhs = "0xdd358416f52ecd34",
+        clmul_result = "0xa6299579b980928",
         swap_op = "0x1234567890123456",
         swapped = "0x5634129078563412",
         reversed = "0x6a2c48091e6a2c48",
@@ -1197,6 +1209,9 @@ impl u128 {
         fsh_op = "0x2fe78e45983acd98039000008736273",
         fshl_result = "0x4f7602fe",
         fshr_result = "0x4f7602fe78e45983acd9803900000873",
+        clmul_lhs = "0x12345678901234567890123456789012",
+        clmul_rhs = "0x4317e40ab4ddcf05dd358416f52ecd34",
+        clmul_result = "0xb9cf660de35d0c170a6299579b980928",
         swap_op = "0x12345678901234567890123456789012",
         swapped = "0x12907856341290785634129078563412",
         reversed = "0x48091e6a2c48091e6a2c48091e6a2c48",
@@ -1223,9 +1238,12 @@ impl usize {
         rot = 4,
         rot_op = "0xa003",
         rot_result = "0x3a",
-        fsh_op = "0x2fe78e45983acd98039000008736273",
-        fshl_result = "0x4f7602fe",
-        fshr_result = "0x4f7602fe78e45983acd9803900000873",
+        fsh_op = "0x2de",
+        fshl_result = "0x30",
+        fshr_result = "0x302d",
+        clmul_lhs = "0x9012",
+        clmul_rhs = "0xcd34",
+        clmul_result = "0x928",
         swap_op = "0x1234",
         swapped = "0x3412",
         reversed = "0x2c48",
@@ -1253,6 +1271,9 @@ impl usize {
         fsh_op = "0x2fe78e45",
         fshl_result = "0xb32f",
         fshr_result = "0xb32fe78e",
+        clmul_lhs = "0x56789012",
+        clmul_rhs = "0xf52ecd34",
+        clmul_result = "0x9b980928",
         swap_op = "0x12345678",
         swapped = "0x78563412",
         reversed = "0x1e6a2c48",
@@ -1280,6 +1301,9 @@ impl usize {
         fsh_op = "0x2fe78e45983acd98",
         fshl_result = "0x6e12fe",
         fshr_result = "0x6e12fe78e45983ac",
+        clmul_lhs = "0x7890123456789012",
+        clmul_rhs = "0xdd358416f52ecd34",
+        clmul_result = "0xa6299579b980928",
         swap_op = "0x1234567890123456",
         swapped = "0x5634129078563412",
         reversed = "0x6a2c48091e6a2c48",
diff --git a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs
index 5c263ea845cc2..3d045fb913edc 100644
--- a/library/core/src/num/uint_macros.rs
+++ b/library/core/src/num/uint_macros.rs
@@ -17,6 +17,9 @@ macro_rules! uint_impl {
         fsh_op = $fsh_op:literal,
         fshl_result = $fshl_result:literal,
         fshr_result = $fshr_result:literal,
+        clmul_lhs = $clmul_lhs:literal,
+        clmul_rhs = $clmul_rhs:literal,
+        clmul_result = $clmul_result:literal,
         swap_op = $swap_op:literal,
         swapped = $swapped:literal,
         reversed = $reversed:literal,
@@ -482,6 +485,32 @@ macro_rules! uint_impl {
             unsafe { intrinsics::unchecked_funnel_shr(self, rhs, n) }
         }
 
+        /// Performs a carry-less multiplication.
+        ///
+        /// This is similar to long multiplication except that the carry is discarded.
+        /// This function wraps, so only the low bits are returned.
+        ///
+        /// This operation can be used to model multiplication in `GF(2)[X]`, the polynomial
+        /// ring over `GF(2)`.
+        ///
+        /// ```
+        /// #![feature(uint_carryless_mul)]
+        ///
+        #[doc = concat!("let a = ", $clmul_lhs, stringify!($SelfT), ";")]
+        #[doc = concat!("let b = ", $clmul_rhs, stringify!($SelfT), ";")]
+        ///
+        #[doc = concat!("assert_eq!(a.carryless_mul(b), ", $clmul_result, ");")]
+        /// ```
+        #[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")]
+        #[doc(alias = "clmul")]
+        #[unstable(feature = "uint_carryless_mul", issue = "152080")]
+        #[must_use = "this returns the result of the operation, \
+                      without modifying the original"]
+        #[inline(always)]
+        pub const fn carryless_mul(self, rhs: Self) -> Self {
+            intrinsics::carryless_mul(self, rhs)
+        }
+
         /// Reverses the byte order of the integer.
         ///
         /// # Examples
diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs
index d085e4ad1a8fe..f3b36ef9092b7 100644
--- a/library/coretests/tests/lib.rs
+++ b/library/coretests/tests/lib.rs
@@ -120,6 +120,7 @@
 #![feature(try_trait_v2)]
 #![feature(type_info)]
 #![feature(uint_bit_width)]
+#![feature(uint_carryless_mul)]
 #![feature(uint_gather_scatter_bits)]
 #![feature(unsize)]
 #![feature(unwrap_infallible)]
diff --git a/library/coretests/tests/num/uint_macros.rs b/library/coretests/tests/num/uint_macros.rs
index 7c4fb22599c03..240c66fd5c715 100644
--- a/library/coretests/tests/num/uint_macros.rs
+++ b/library/coretests/tests/num/uint_macros.rs
@@ -117,6 +117,13 @@ macro_rules! uint_module {
                 assert_eq_const_safe!($T: <$T>::funnel_shr(_1, _1, 4), <$T>::rotate_right(_1, 4));
             }
 
+            fn test_carryless_mul() {
+                assert_eq_const_safe!($T: <$T>::carryless_mul(0, 0), 0);
+                assert_eq_const_safe!($T: <$T>::carryless_mul(1, 1), 1);
+
+                assert_eq_const_safe!($T: <$T>::carryless_mul(0b0100, 2), 0b1000);
+            }
+
             fn test_swap_bytes() {
                 assert_eq_const_safe!($T: A.swap_bytes().swap_bytes(), A);
                 assert_eq_const_safe!($T: B.swap_bytes().swap_bytes(), B);
diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs
index dcde208fac77b..12fa3efa84803 100644
--- a/library/std/src/lib.rs
+++ b/library/std/src/lib.rs
@@ -315,6 +315,7 @@
 #![feature(try_blocks)]
 #![feature(try_trait_v2)]
 #![feature(type_alias_impl_trait)]
+#![feature(uint_carryless_mul)]
 // tidy-alphabetical-end
 //
 // Library features (core):

From b11c3d58dcc3cc483445656389c855eb581a7e96 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Thu, 5 Feb 2026 22:16:43 +0100
Subject: [PATCH 02/11] use the simple, inefficient implementation in
 rustc_const_eval

---
 .../src/interpret/intrinsics.rs               | 27 +++++++++++++++++++
 library/core/src/intrinsics/mod.rs            |  3 ++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/compiler/rustc_const_eval/src/interpret/intrinsics.rs b/compiler/rustc_const_eval/src/interpret/intrinsics.rs
index e526f6120689a..09922d401657d 100644
--- a/compiler/rustc_const_eval/src/interpret/intrinsics.rs
+++ b/compiler/rustc_const_eval/src/interpret/intrinsics.rs
@@ -733,6 +733,33 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
             sym::fmuladdf128 => {
                 self.float_muladd_intrinsic::<Quad>(args, dest, MulAddType::Nondeterministic)?
             }
+            sym::carryless_mul => {
+                let size = dest.layout.size;
+
+                let left = self.read_scalar(&args[0])?.to_bits(size)?;
+                let right = self.read_scalar(&args[1])?.to_bits(size)?;
+
+                // perform carry-less multiplication.
+                //
+                // this operation is like long multiplication, but ignores the carries.
+                // that idea corresponds to the xor operator, which is used in the implementation.
+                //
+                // wikipedia has an example https://en.wikipedia.org/wiki/carry-less_product#example
+                let mut result: u128 = 0;
+
+                for i in 0..size.bits() {
+                    // if the i-th bit in right is set
+                    if (right >> i) & 1 != 0 {
+                        // xor result with `left` shifted to the left by i positions
+                        result ^= left << i;
+                    }
+                }
+
+                // Only return the lower bits.
+                result &= u128::MAX >> (128 - size.bits());
+
+                self.write_scalar(Scalar::from_uint(result, dest.layout.size), dest)?;
+            }
 
             // Unsupported intrinsic: skip the return_to_block below.
             _ => return interp_ok(false),
diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs
index 1821497640251..7c6dbfdb7ab70 100644
--- a/library/core/src/intrinsics/mod.rs
+++ b/library/core/src/intrinsics/mod.rs
@@ -2187,8 +2187,9 @@ pub const unsafe fn unchecked_funnel_shr<T: [const] fallback::FunnelShift>(
 #[rustc_nounwind]
 #[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")]
 #[unstable(feature = "uint_carryless_mul", issue = "152080")]
-#[miri::intrinsic_fallback_is_spec]
 pub const fn carryless_mul<T: [const] fallback::CarrylessMul>(a: T, b: T) -> T {
+    // NOTE: while this implementation could serve as the specification, rustc_const_eval
+    // actually implements a simpler but less efficient variant as the specification.
     a.carryless_mul(b)
 }
 

From fc1448f0d574fc015a32041fb92049cacc061a7f Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Thu, 5 Feb 2026 22:17:34 +0100
Subject: [PATCH 03/11] add a custom fallback for u128 (the generic version
 would run into carries)

---
 library/core/src/intrinsics/fallback.rs | 57 ++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 2 deletions(-)

diff --git a/library/core/src/intrinsics/fallback.rs b/library/core/src/intrinsics/fallback.rs
index a9b610ae2cbc1..4f4e125a0ae4e 100644
--- a/library/core/src/intrinsics/fallback.rs
+++ b/library/core/src/intrinsics/fallback.rs
@@ -238,7 +238,7 @@ macro_rules! impl_carryless_mul{
                 use crate::num::Wrapping;
 
                 // i.e. 0b100010001...0001 in binary.
-                const MASK: u128 = 0x1111_1111_1111_1111_1111_1111_1111_1111;
+                const MASK: u64 = 0x1111_1111_1111_1111;
 
                 let m0 = MASK as $type;
                 let x = self;
@@ -270,5 +270,58 @@ macro_rules! impl_carryless_mul{
 }
 
 impl_carryless_mul! {
-    u8, u16, u32, u64, u128, usize
+    u8, u16, u32, u64, usize
+}
+
+#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")]
+impl const CarrylessMul for u128 {
+    #[inline]
+    fn carryless_mul(self, rhs: Self) -> Self {
+        // For u128 the 0b100010001...0001 trick above does not work, so we use an implementation
+        // that uses 64-bit carryless multiplication.
+        karatsuba_u128(self, rhs).1
+    }
+}
+
+#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")]
+const fn karatsuba_u128(h: u128, y: u128) -> (u128, u128) {
+    // Karatsuba input decomposition for H
+    let (h1, h0) = ((h >> 64) as u64, h as u64);
+    let h0r = h0.reverse_bits();
+    let h1r = h1.reverse_bits();
+    let h2 = h0 ^ h1;
+    let h2r = h0r ^ h1r;
+
+    // Karatsuba input decomposition for Y
+    let (y1, y0) = ((y >> 64) as u64, y as u64);
+    let y0r = y0.reverse_bits();
+    let y1r = y1.reverse_bits();
+    let y2 = y0 ^ y1;
+    let y2r = y0r ^ y1r;
+
+    // Perform carryless multiplications
+    let z0 = y0.carryless_mul(h0);
+    let z1 = y1.carryless_mul(h1);
+    let mut z2 = y2.carryless_mul(h2);
+    let mut z0h = y0r.carryless_mul(h0r);
+    let mut z1h = y1r.carryless_mul(h1r);
+    let mut z2h = y2r.carryless_mul(h2r);
+
+    // Karatsuba recombination
+    z2 ^= z0 ^ z1;
+    z2h ^= z0h ^ z1h;
+    z0h = z0h.reverse_bits() >> 1;
+    z1h = z1h.reverse_bits() >> 1;
+    z2h = z2h.reverse_bits() >> 1;
+
+    // Assemble the final 256-bit product
+    let v0 = z0;
+    let v1 = z0h ^ z2;
+    let v2 = z1 ^ z2h;
+    let v3 = z1h;
+
+    let high = ((v3 as u128) << 64) | v2 as u128;
+    let low = ((v1 as u128) << 64) | v0 as u128;
+
+    (high, low)
 }

From 3cef0de57a67dc7401e28407d2f62bf16eec7b78 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Fri, 6 Feb 2026 00:01:07 +0100
Subject: [PATCH 04/11] use mask constants in the generic impl

---
 library/core/src/intrinsics/fallback.rs | 30 ++++++++++++-------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/library/core/src/intrinsics/fallback.rs b/library/core/src/intrinsics/fallback.rs
index 4f4e125a0ae4e..19be1e5d2a2f3 100644
--- a/library/core/src/intrinsics/fallback.rs
+++ b/library/core/src/intrinsics/fallback.rs
@@ -238,32 +238,32 @@ macro_rules! impl_carryless_mul{
                 use crate::num::Wrapping;
 
                 // i.e. 0b100010001...0001 in binary.
-                const MASK: u64 = 0x1111_1111_1111_1111;
+                const MASK: u64 = 0x1111_1111_1111_1111u64;
+
+                const M0: $type = MASK as $type;
+                const M1: $type = M0 << 1;
+                const M2: $type = M1 << 1;
+                const M3: $type = M2 << 1;
 
-                let m0 = MASK as $type;
                 let x = self;
                 let y = rhs;
 
-                let m1 = m0 << 1;
-                let m2 = m1 << 1;
-                let m3 = m2 << 1;
-
-                let x0 = Wrapping(x & m0);
-                let x1 = Wrapping(x & m1);
-                let x2 = Wrapping(x & m2);
-                let x3 = Wrapping(x & m3);
+                let x0 = Wrapping(x & M0);
+                let x1 = Wrapping(x & M1);
+                let x2 = Wrapping(x & M2);
+                let x3 = Wrapping(x & M3);
 
-                let y0 = Wrapping(y & m0);
-                let y1 = Wrapping(y & m1);
-                let y2 = Wrapping(y & m2);
-                let y3 = Wrapping(y & m3);
+                let y0 = Wrapping(y & M0);
+                let y1 = Wrapping(y & M1);
+                let y2 = Wrapping(y & M2);
+                let y3 = Wrapping(y & M3);
 
                 let z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1);
                 let z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2);
                 let z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3);
                 let z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0);
 
-                (z0.0 & m0) | (z1.0 & m1) | (z2.0 & m2) | (z3.0 & m3)
+                (z0.0 & M0) | (z1.0 & M1) | (z2.0 & M2) | (z3.0 & M3)
             }
         }
     )*};

From 2220317fe0d5265dedf950f675c68c5b08d33f5a Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Fri, 6 Feb 2026 00:01:15 +0100
Subject: [PATCH 05/11] update docs

---
 library/core/src/num/uint_macros.rs | 40 +++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs
index 3d045fb913edc..30b340cba8fd3 100644
--- a/library/core/src/num/uint_macros.rs
+++ b/library/core/src/num/uint_macros.rs
@@ -485,13 +485,43 @@ macro_rules! uint_impl {
             unsafe { intrinsics::unchecked_funnel_shr(self, rhs, n) }
         }
 
-        /// Performs a carry-less multiplication.
+        /// Performs a carry-less multiplication, returning the lower bits.
+        ///
+        /// This operation is similar to long multiplication, except that exclusive or is used
+        /// instead of addition. The implementation is equivalent to:
+        ///
+        /// ```no_run
+        #[doc = concat!("pub fn carryless_mul(lhs: ", stringify!($SelfT), ", rhs: ", stringify!($SelfT), ") -> ", stringify!($SelfT), "{")]
+        ///     let mut retval = 0;
+        #[doc = concat!("    for i in 0..",  stringify!($SelfT), "::BITS {")]
+        ///         if (rhs >> i) & 1 != 0 {
+        ///             // long multiplication would use +=
+        ///             retval ^= lhs << i;
+        ///         }
+        ///     }
+        ///     retval
+        /// }
+        /// ```
+        ///
+        /// The actual implementation is more efficient, and on some platforms lowers directly to a
+        /// dedicated instruction.
         ///
-        /// This is similar to long multiplication except that the carry is discarded.
-        /// This function wraps, so only the low bits are returned.
+        /// # Uses
         ///
-        /// This operation can be used to model multiplication in `GF(2)[X]`, the polynomial
-        /// ring over `GF(2)`.
+        /// Carryless multiplication can be used to turn a bitmask of quote characters into a
+        /// bit mask of characters surrounded by quotes:
+        ///
+        /// ```no_run
+        /// r#"abc xxx "foobar" zzz "a"!"#; // input string
+        ///  0b0000000010000001000001010; // quote_mask
+        ///  0b0000000001111110000000100; // quote_mask.carryless_mul(!0) & !quote_mask
+        /// ```
+        ///
+        /// Another use is in cryptography, where carryless multiplication allows for efficient
+        /// implementations of polynomial multiplication in `GF(2)[X]`, the polynomial ring
+        /// over `GF(2)`.
+        ///
+        /// # Examples
         ///
         /// ```
         /// #![feature(uint_carryless_mul)]

From ecf4d3fa8bb6de6f7c95d76b5f17cb2424f61467 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Fri, 6 Feb 2026 00:28:38 +0100
Subject: [PATCH 06/11] use the fallback with earlier versions of LLVM

---
 compiler/rustc_codegen_llvm/src/intrinsic.rs | 27 +++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index 570e9602a60ce..0ac068787ff2a 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -387,6 +387,27 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 let pair = self.insert_value(pair, high, 1);
                 pair
             }
+
+            // FIXME move into the branch below when LLVM 22 is the lowest version we support.
+            sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => {
+                let ty = args[0].layout.ty;
+                if !ty.is_integral() {
+                    tcx.dcx().emit_err(InvalidMonomorphization::BasicIntegerType {
+                        span,
+                        name,
+                        ty,
+                    });
+                    return Ok(());
+                }
+                let (size, _) = ty.int_size_and_signed(self.tcx);
+                let width = size.bits();
+                let llty = self.type_ix(width);
+
+                let lhs = args[0].immediate();
+                let rhs = args[1].immediate();
+                self.call_intrinsic("llvm.clmul", &[llty], &[lhs, rhs])
+            }
+
             sym::ctlz
             | sym::ctlz_nonzero
             | sym::cttz
@@ -396,7 +417,6 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
             | sym::bitreverse
             | sym::saturating_add
             | sym::saturating_sub
-            | sym::carryless_mul
             | sym::unchecked_funnel_shl
             | sym::unchecked_funnel_shr => {
                 let ty = args[0].layout.ty;
@@ -439,11 +459,6 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                     sym::bitreverse => {
                         self.call_intrinsic("llvm.bitreverse", &[llty], &[args[0].immediate()])
                     }
-                    sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => {
-                        let lhs = args[0].immediate();
-                        let rhs = args[1].immediate();
-                        self.call_intrinsic("llvm.clmul", &[llty], &[lhs, rhs])
-                    }
                     sym::unchecked_funnel_shl | sym::unchecked_funnel_shr => {
                         let is_left = name == sym::unchecked_funnel_shl;
                         let lhs = args[0].immediate();

From 16a0add2b1ca0db1c725e2c18f11e7024fb94113 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Fri, 6 Feb 2026 00:29:23 +0100
Subject: [PATCH 07/11] add `carryless_mul` to `replaced_intrinsics`

---
 compiler/rustc_codegen_llvm/src/lib.rs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs
index bf3ec1f393302..eea4dfc08b7c7 100644
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@@ -345,7 +345,14 @@ impl CodegenBackend for LlvmCodegenBackend {
     }
 
     fn replaced_intrinsics(&self) -> Vec<Symbol> {
-        vec![sym::unchecked_funnel_shl, sym::unchecked_funnel_shr, sym::carrying_mul_add]
+        let mut will_not_use_fallback =
+            vec![sym::unchecked_funnel_shl, sym::unchecked_funnel_shr, sym::carrying_mul_add];
+        // Only list `carryless_mul` when the LLVM version in use is at least 22.
+        if llvm_util::get_version() >= (22, 0, 0) {
+            will_not_use_fallback.push(sym::carryless_mul);
+        }
+
+        will_not_use_fallback
     }
 
     fn codegen_crate<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Box<dyn Any> {

From fbc7f99d62f493928f5b6bbc22ed4c144d2d16d6 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Fri, 6 Feb 2026 10:57:55 +0100
Subject: [PATCH 08/11] add improved u128 implementation

---
 library/core/src/intrinsics/fallback.rs | 69 +++++++++++--------------
 library/core/src/num/uint_macros.rs     |  2 +-
 2 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/library/core/src/intrinsics/fallback.rs b/library/core/src/intrinsics/fallback.rs
index 19be1e5d2a2f3..3244ccb8989be 100644
--- a/library/core/src/intrinsics/fallback.rs
+++ b/library/core/src/intrinsics/fallback.rs
@@ -277,51 +277,42 @@ impl_carryless_mul! {
 impl const CarrylessMul for u128 {
     #[inline]
     fn carryless_mul(self, rhs: Self) -> Self {
-        // For u128 the 0b100010001...0001 trick above does not work, so we use an implementation
-        // that uses 64-bit carryless multiplication.
-        karatsuba_u128(self, rhs).1
+        let l = u64::carryless_mul(self as u64, rhs as u64);
+        let lh = u64::carryless_mul(self as u64, (rhs >> 64) as u64);
+        let hl = u64::carryless_mul((self >> 64) as u64, rhs as u64);
+        let h = lh ^ hl ^ carryless_mul_high(self as u64, rhs as u64);
+        ((h as u128) << 64) | l as u128
     }
 }
 
 #[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")]
-const fn karatsuba_u128(h: u128, y: u128) -> (u128, u128) {
-    // Karatsuba input decomposition for H
-    let (h1, h0) = ((h >> 64) as u64, h as u64);
-    let h0r = h0.reverse_bits();
-    let h1r = h1.reverse_bits();
-    let h2 = h0 ^ h1;
-    let h2r = h0r ^ h1r;
-
-    // Karatsuba input decomposition for Y
-    let (y1, y0) = ((y >> 64) as u64, y as u64);
-    let y0r = y0.reverse_bits();
-    let y1r = y1.reverse_bits();
-    let y2 = y0 ^ y1;
-    let y2r = y0r ^ y1r;
-
-    // Perform carryless multiplications
-    let z0 = y0.carryless_mul(h0);
-    let z1 = y1.carryless_mul(h1);
-    let mut z2 = y2.carryless_mul(h2);
-    let mut z0h = y0r.carryless_mul(h0r);
-    let mut z1h = y1r.carryless_mul(h1r);
-    let mut z2h = y2r.carryless_mul(h2r);
+#[inline]
+const fn carryless_mul_high(x: u64, y: u64) -> u64 {
+    // i.e. 0b100010001...0001 in binary.
+    const MASK: u64 = 0x1111_1111_1111_1111u64;
 
-    // Karatsuba recombination
-    z2 ^= z0 ^ z1;
-    z2h ^= z0h ^ z1h;
-    z0h = z0h.reverse_bits() >> 1;
-    z1h = z1h.reverse_bits() >> 1;
-    z2h = z2h.reverse_bits() >> 1;
+    const M0: u64 = MASK;
+    const M1: u64 = M0 << 1;
+    const M2: u64 = M1 << 1;
+    const M3: u64 = M2 << 1;
 
-    // Assemble the final 256-bit product
-    let v0 = z0;
-    let v1 = z0h ^ z2;
-    let v2 = z1 ^ z2h;
-    let v3 = z1h;
+    macro_rules! mul {
+        ($x_mask_shift:literal, $y_mask_shift:literal) => {{
+            let x = x & (MASK << $x_mask_shift);
+            let y = y & (MASK << $y_mask_shift);
+            crate::hint::select_unpredictable(
+                x == MASK << $x_mask_shift && y == MASK << $y_mask_shift,
+                // only case where the multiply overflows the 4-bit parts
+                0x0101_0101_0101_0101u64 << ($x_mask_shift + $y_mask_shift),
+                x.carrying_mul(y, 0).1,
+            )
+        }};
+    }
 
-    let high = ((v3 as u128) << 64) | v2 as u128;
-    let low = ((v1 as u128) << 64) | v0 as u128;
+    let z0 = mul!(0, 0) ^ mul!(1, 3) ^ mul!(2, 2) ^ mul!(3, 1);
+    let z1 = mul!(0, 1) ^ mul!(1, 0) ^ mul!(2, 3) ^ mul!(3, 2);
+    let z2 = mul!(0, 2) ^ mul!(1, 1) ^ mul!(2, 0) ^ mul!(3, 3);
+    let z3 = mul!(0, 3) ^ mul!(1, 2) ^ mul!(2, 1) ^ mul!(3, 0);
 
-    (high, low)
+    (z0 & M0) | (z1 & M1) | (z2 & M2) | (z3 & M3)
 }
diff --git a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs
index 30b340cba8fd3..62c0e3e317da4 100644
--- a/library/core/src/num/uint_macros.rs
+++ b/library/core/src/num/uint_macros.rs
@@ -503,7 +503,7 @@ macro_rules! uint_impl {
         /// }
         /// ```
         ///
-        /// The actual implementation is more efficient, and on some plaforms lowers directly to a
+        /// The actual implementation is more efficient, and on some platforms lowers directly to a
         /// dedicated instruction.
         ///
         /// # Uses

From 7404f05e5b173cccce40b19a6b91857f2b5d5e81 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Fri, 6 Feb 2026 16:43:50 +0100
Subject: [PATCH 09/11] add `widening_carryless_mul`

---
 library/core/src/num/mod.rs | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs
index aa3a3e5d3ca41..7a716b79d4a5b 100644
--- a/library/core/src/num/mod.rs
+++ b/library/core/src/num/mod.rs
@@ -244,6 +244,30 @@ macro_rules! midpoint_impl {
     };
 }
 
+macro_rules! widening_carryless_mul_impl {
+    ($SelfT:ty, $WideT:ty) => {
+        /// Performs a widening carry-less multiplication, computed in the wider result type.
+        ///
+        /// # Examples
+        ///
+        /// ```
+        /// #![feature(uint_carryless_mul)]
+        ///
+        #[doc = concat!("assert_eq!(", stringify!($SelfT), "::MAX.widening_carryless_mul(",
+                                stringify!($SelfT), "::MAX), ", stringify!($WideT), "::MAX / 3);")]
+        /// ```
+        #[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")]
+        #[doc(alias = "clmul")]
+        #[unstable(feature = "uint_carryless_mul", issue = "152080")]
+        #[must_use = "this returns the result of the operation, \
+                      without modifying the original"]
+        #[inline]
+        pub const fn widening_carryless_mul(self, rhs: $SelfT) -> $WideT {
+            (self as $WideT).carryless_mul(rhs as $WideT)
+        }
+    };
+}
+
 impl i8 {
     int_impl! {
         Self = i8,
@@ -471,6 +495,7 @@ impl u8 {
         bound_condition = "",
     }
     midpoint_impl! { u8, u16, unsigned }
+    widening_carryless_mul_impl! { u8, u16 }
 
     /// Checks if the value is within the ASCII range.
     ///
@@ -1111,6 +1136,7 @@ impl u16 {
         bound_condition = "",
     }
     midpoint_impl! { u16, u32, unsigned }
+    widening_carryless_mul_impl! { u16, u32 }
 
     /// Checks if the value is a Unicode surrogate code point, which are disallowed values for [`char`].
     ///
@@ -1164,6 +1190,7 @@ impl u32 {
         bound_condition = "",
     }
     midpoint_impl! { u32, u64, unsigned }
+    widening_carryless_mul_impl! { u32, u64 }
 }
 
 impl u64 {
@@ -1193,6 +1220,7 @@ impl u64 {
         bound_condition = "",
     }
     midpoint_impl! { u64, u128, unsigned }
+    widening_carryless_mul_impl! { u64, u128 }
 }
 
 impl u128 {

From 0e2315c1ab8f9400e2ab324d84e136a035b1ded9 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Fri, 6 Feb 2026 21:33:04 +0100
Subject: [PATCH 10/11] add `carrying_carryless_mul`

---
 library/core/src/num/mod.rs | 97 +++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs
index 7a716b79d4a5b..6771130472d62 100644
--- a/library/core/src/num/mod.rs
+++ b/library/core/src/num/mod.rs
@@ -265,7 +265,96 @@ macro_rules! widening_carryless_mul_impl {
         pub const fn widening_carryless_mul(self, rhs: $SelfT) -> $WideT {
             (self as $WideT).carryless_mul(rhs as $WideT)
         }
+    }
+}
+
+macro_rules! carrying_carryless_mul_impl {
+    (u128, u256) => {
+        carrying_carryless_mul_impl! { @internal u128 =>
+            pub const fn carrying_carryless_mul(self, rhs: Self, carry: Self) -> (Self, Self) {
+                // Karatsuba input decomposition for H
+                let (h1, h0) = ((self >> 64) as u64, self as u64);
+                let h0r = h0.reverse_bits();
+                let h1r = h1.reverse_bits();
+                let h2 = h0 ^ h1;
+                let h2r = h0r ^ h1r;
+
+                // Karatsuba input decomposition for Y
+                let (y1, y0) = ((rhs >> 64) as u64, rhs as u64);
+                let y0r = y0.reverse_bits();
+                let y1r = y1.reverse_bits();
+                let y2 = y0 ^ y1;
+                let y2r = y0r ^ y1r;
+
+                // Perform carryless multiplications
+                let z0 = y0.carryless_mul(h0);
+                let z1 = y1.carryless_mul(h1);
+                let mut z2 = y2.carryless_mul(h2);
+                let mut z0h = y0r.carryless_mul(h0r);
+                let mut z1h = y1r.carryless_mul(h1r);
+                let mut z2h = y2r.carryless_mul(h2r);
+
+                // Karatsuba recombination
+                z2 ^= z0 ^ z1;
+                z2h ^= z0h ^ z1h;
+                z0h = z0h.reverse_bits() >> 1;
+                z1h = z1h.reverse_bits() >> 1;
+                z2h = z2h.reverse_bits() >> 1;
+
+                // Assemble the final 256-bit product
+                let v0 = z0;
+                let v1 = z0h ^ z2;
+                let v2 = z1 ^ z2h;
+                let v3 = z1h;
+
+                let hi = ((v3 as u128) << 64) | v2 as u128;
+                let lo = ((v1 as u128) << 64) | v0 as u128;
+
+                (lo ^ carry, hi)
+            }
+        }
+    };
+    ($SelfT:ty, $WideT:ty) => {
+        carrying_carryless_mul_impl! { @internal $SelfT =>
+            pub const fn carrying_carryless_mul(self, rhs: Self, carry: Self) -> (Self, Self) {
+                // Can't use widening_carryless_mul because it's not implemented for usize.
+                let p = (self as $WideT).carryless_mul(rhs as $WideT);
+                // Split the wide product into its low `Self::BITS` bits and the bits above them.
+                let lo = p as $SelfT;
+                let hi = (p >> Self::BITS) as $SelfT;
+
+                (lo ^ carry, hi)
+            }
+        }
     };
+    (@internal $SelfT:ty => $($fn:tt)*) => {
+        /// Calculates the "full carryless multiplication" without the possibility to overflow.
+        ///
+        /// This returns the low-order (wrapping) bits, XORed with `carry`, and the high-order
+        /// (overflow) bits of the product as two separate values, in that order.
+        ///
+        /// # Examples
+        ///
+        /// Please note that this example is shared among integer types, which is why `u8` is used.
+        ///
+        /// ```
+        /// #![feature(uint_carryless_mul)]
+        ///
+        /// assert_eq!(0b1000_0000u8.carrying_carryless_mul(0b1000_0000, 0b0000), (0, 0b0100_0000));
+        /// assert_eq!(0b1000_0000u8.carrying_carryless_mul(0b1000_0000, 0b1111), (0b1111, 0b0100_0000));
+        #[doc = concat!("assert_eq!(",
+            stringify!($SelfT), "::MAX.carrying_carryless_mul(", stringify!($SelfT), "::MAX, ", stringify!($SelfT), "::MAX), ",
+            "(!(", stringify!($SelfT), "::MAX / 3), ", stringify!($SelfT), "::MAX / 3));"
+        )]
+        /// ```
+        #[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")]
+        #[doc(alias = "clmul")]
+        #[unstable(feature = "uint_carryless_mul", issue = "152080")]
+        #[must_use = "this returns the result of the operation, \
+                      without modifying the original"]
+        #[inline]
+        $($fn)*
+    }
 }
 
 impl i8 {
@@ -496,6 +585,7 @@ impl u8 {
     }
     midpoint_impl! { u8, u16, unsigned }
     widening_carryless_mul_impl! { u8, u16 }
+    carrying_carryless_mul_impl! { u8, u16 }
 
     /// Checks if the value is within the ASCII range.
     ///
@@ -1137,6 +1227,7 @@ impl u16 {
     }
     midpoint_impl! { u16, u32, unsigned }
     widening_carryless_mul_impl! { u16, u32 }
+    carrying_carryless_mul_impl! { u16, u32 }
 
     /// Checks if the value is a Unicode surrogate code point, which are disallowed values for [`char`].
     ///
@@ -1191,6 +1282,7 @@ impl u32 {
     }
     midpoint_impl! { u32, u64, unsigned }
     widening_carryless_mul_impl! { u32, u64 }
+    carrying_carryless_mul_impl! { u32, u64 }
 }
 
 impl u64 {
@@ -1221,6 +1313,7 @@ impl u64 {
     }
     midpoint_impl! { u64, u128, unsigned }
     widening_carryless_mul_impl! { u64, u128 }
+    carrying_carryless_mul_impl! { u64, u128 }
 }
 
 impl u128 {
@@ -1252,6 +1345,7 @@ impl u128 {
         bound_condition = "",
     }
     midpoint_impl! { u128, unsigned }
+    carrying_carryless_mul_impl! { u128, u256 }
 }
 
 #[cfg(target_pointer_width = "16")]
@@ -1282,6 +1376,7 @@ impl usize {
         bound_condition = " on 16-bit targets",
     }
     midpoint_impl! { usize, u32, unsigned }
+    carrying_carryless_mul_impl! { usize, u32 }
 }
 
 #[cfg(target_pointer_width = "32")]
@@ -1312,6 +1407,7 @@ impl usize {
         bound_condition = " on 32-bit targets",
     }
     midpoint_impl! { usize, u64, unsigned }
+    carrying_carryless_mul_impl! { usize, u64 }
 }
 
 #[cfg(target_pointer_width = "64")]
@@ -1342,6 +1438,7 @@ impl usize {
         bound_condition = " on 64-bit targets",
     }
     midpoint_impl! { usize, u128, unsigned }
+    carrying_carryless_mul_impl! { usize, u128 }
 }
 
 impl usize {

From 59d6fa7938a7b9f8efe588e47520c54ec2ce20f4 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Sun, 8 Feb 2026 11:48:57 +0100
Subject: [PATCH 11/11] improve 128-bit `carrying_carryless_mul`

---
 library/core/src/num/mod.rs | 59 ++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 37 deletions(-)

diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs
index 6771130472d62..839a6fbdc9b7e 100644
--- a/library/core/src/num/mod.rs
+++ b/library/core/src/num/mod.rs
@@ -272,43 +272,28 @@ macro_rules! carrying_carryless_mul_impl {
     (u128, u256) => {
         carrying_carryless_mul_impl! { @internal u128 =>
             pub const fn carrying_carryless_mul(self, rhs: Self, carry: Self) -> (Self, Self) {
-                // Karatsuba input decomposition for H
-                let (h1, h0) = ((self >> 64) as u64, self as u64);
-                let h0r = h0.reverse_bits();
-                let h1r = h1.reverse_bits();
-                let h2 = h0 ^ h1;
-                let h2r = h0r ^ h1r;
-
-                // Karatsuba input decomposition for Y
-                let (y1, y0) = ((rhs >> 64) as u64, rhs as u64);
-                let y0r = y0.reverse_bits();
-                let y1r = y1.reverse_bits();
-                let y2 = y0 ^ y1;
-                let y2r = y0r ^ y1r;
-
-                // Perform carryless multiplications
-                let z0 = y0.carryless_mul(h0);
-                let z1 = y1.carryless_mul(h1);
-                let mut z2 = y2.carryless_mul(h2);
-                let mut z0h = y0r.carryless_mul(h0r);
-                let mut z1h = y1r.carryless_mul(h1r);
-                let mut z2h = y2r.carryless_mul(h2r);
-
-                // Karatsuba recombination
-                z2 ^= z0 ^ z1;
-                z2h ^= z0h ^ z1h;
-                z0h = z0h.reverse_bits() >> 1;
-                z1h = z1h.reverse_bits() >> 1;
-                z2h = z2h.reverse_bits() >> 1;
-
-                // Assemble the final 256-bit product
-                let v0 = z0;
-                let v1 = z0h ^ z2;
-                let v2 = z1 ^ z2h;
-                let v3 = z1h;
-
-                let hi = ((v3 as u128) << 64) | v2 as u128;
-                let lo = ((v1 as u128) << 64) | v0 as u128;
+                let x0 = self as u64;
+                let x1 = (self >> 64) as u64;
+                let y0 = rhs as u64;
+                let y1 = (rhs >> 64) as u64;
+
+                let z0 = u64::widening_carryless_mul(x0, y0);
+                let z2 = u64::widening_carryless_mul(x1, y1);
+
+                // The grade school algorithm would compute:
+                // z1 = x0y1 ^ x1y0
+
+                // Instead, Karatsuba first computes:
+                let z3 = u64::widening_carryless_mul(x0 ^ x1, y0 ^ y1);
+                // Since it distributes over XOR,
+                // z3 == x0y0 ^ x0y1 ^ x1y0 ^ x1y1
+                //       |--|   |---------|   |--|
+                //    ==  z0  ^     z1      ^  z2
+                // so we can compute z1 as
+                let z1 = z3 ^ z0 ^ z2;
+
+                let lo = z0 ^ (z1 << 64);
+                let hi = z2 ^ (z1 >> 64);
 
                 (lo ^ carry, hi)
             }