-
-
Notifications
You must be signed in to change notification settings - Fork 14.5k
implement carryless_mul
#152132
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
implement carryless_mul
#152132
Changes from all commits
7d30e87
b11c3d5
fc1448f
3cef0de
2220317
ecf4d3f
16a0add
fbc7f99
7404f05
0e2315c
59d6fa7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -733,6 +733,33 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> { | |
| sym::fmuladdf128 => { | ||
| self.float_muladd_intrinsic::<Quad>(args, dest, MulAddType::Nondeterministic)? | ||
| } | ||
| sym::carryless_mul => { | ||
| let size = dest.layout.size; | ||
|
|
||
| let left = self.read_scalar(&args[0])?.to_bits(size)?; | ||
| let right = self.read_scalar(&args[1])?.to_bits(size)?; | ||
|
|
||
| // perform carry-less multiplication. | ||
| // | ||
| // this operation is like long multiplication, but ignores the carries. | ||
| // that idea corresponds to the xor operator, which is used in the implementation. | ||
| // | ||
| // wikipedia has an example https://en.wikipedia.org/wiki/carry-less_product#example | ||
| let mut result: u128 = 0; | ||
|
|
||
| for i in 0..size.bits() { | ||
| // if the i-th bit in right is set | ||
| if (right >> i) & 1 != 0 { | ||
| // xor result with `left` shifted to the left by i positions | ||
| result ^= left << i; | ||
| } | ||
| } | ||
|
Comment on lines
+750
to
+756
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please explain why it is okay to run this algorithm on
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there is no signed carryless-mul, if it exists on a signed type, it always returns the same results as the corresponding unsigned version (the definition using shifting assumes unsigned types where that matters). |
||
|
|
||
| // Only return the lower bits. | ||
| result &= u128::MAX >> (128 - size.bits()); | ||
|
Comment on lines
+758
to
+759
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use |
||
|
|
||
| self.write_scalar(Scalar::from_uint(result, dest.layout.size), dest)?; | ||
| } | ||
|
|
||
| // Unsupported intrinsic: skip the return_to_block below. | ||
| _ => return interp_ok(false), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -218,3 +218,101 @@ macro_rules! impl_funnel_shifts { | |
| impl_funnel_shifts! { | ||
| u8, u16, u32, u64, u128, usize | ||
| } | ||
|
|
||
| #[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")] | ||
| pub const trait CarrylessMul: Copy + 'static { | ||
| /// See [`super::carryless_mul`]; we just need the trait indirection to handle | ||
| /// different types since calling intrinsics with generics doesn't work. | ||
| fn carryless_mul(self, rhs: Self) -> Self; | ||
| } | ||
|
|
||
| macro_rules! impl_carryless_mul{ | ||
| ($($type:ident),*) => {$( | ||
| /// This approach uses a bitmask of the form `0b100010001...0001` to avoid carry spilling. | ||
| /// When carries do occur, they wind up in a "hole" of zeros and are subsequently masked | ||
| /// out of the result. | ||
|
Comment on lines
+231
to
+233
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This approach with 4-bit digits works up to integers with For The impl for some tests against a naive impl: playground
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add a |
||
| #[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")] | ||
| impl const CarrylessMul for $type { | ||
| #[inline] | ||
| fn carryless_mul(self, rhs: Self) -> Self { | ||
| use crate::num::Wrapping; | ||
|
|
||
| // i.e. 0b100010001...0001 in binary. | ||
| const MASK: u64 = 0x1111_1111_1111_1111u64; | ||
|
|
||
| const M0: $type = MASK as $type; | ||
| const M1: $type = M0 << 1; | ||
| const M2: $type = M1 << 1; | ||
| const M3: $type = M2 << 1; | ||
|
|
||
| let x = self; | ||
| let y = rhs; | ||
|
|
||
| let x0 = Wrapping(x & M0); | ||
| let x1 = Wrapping(x & M1); | ||
| let x2 = Wrapping(x & M2); | ||
| let x3 = Wrapping(x & M3); | ||
|
|
||
| let y0 = Wrapping(y & M0); | ||
| let y1 = Wrapping(y & M1); | ||
| let y2 = Wrapping(y & M2); | ||
| let y3 = Wrapping(y & M3); | ||
|
|
||
| let z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1); | ||
| let z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2); | ||
| let z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3); | ||
| let z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0); | ||
|
|
||
| (z0.0 & M0) | (z1.0 & M1) | (z2.0 & M2) | (z3.0 & M3) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see that Wikipedia and the relevant C++ paper (https://isocpp.org/files/papers/P3642R3.html) have much simpler implementations. We also provide a simpler implementation as the 'simple implementation' in the description of the public function (I think the same one C++ paper provides). I think this manual unrolling may help performance, but do we have evidence of that? Does that performance matter since in practice distros ~only ship stable toolchains and Rust will not stabilize this on LLVM < 22? Maybe we should use the simple implementation as fallback too?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I'm reading the implementation from that C++ paper correctly, it's doing one integer multiply for every bit of the input, whereas this does 16 multiply ops. For a The C++ example really seems more like a naive/idealized algorithm description designed to lower to optimized intrinsics as opposed to something you'd actually want to practically deploy as a portable implementation. Note: the original LLVM RFC originally proposed using the above method "If the CPU does not have a dedication clmul operation, it can be lowered to regular multiplication, by using holes to avoid carrys" but it sounds like the actual portable codegen out of LLVM isn't using it (yet)?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Technically yes, but note that each of those multiplies is of the form
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, I understand that this is batching that work. But given that this code is unreachable except on gcc + cranelift in Rust-distributed builds, it's not obvious to me how much that matters. I'd also maybe expect that llvm can optimize the naive form a bit closer to this? Not sure there.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's reachable on builds with older llvm
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, but it seems unlikely this will stabilize before those are largely phased out of our support? In any case, I think the main thing is adding more thorough test coverage.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we care enough about older LLVM to add non-trivial fast-paths for those versions? |
||
| } | ||
| } | ||
| )*}; | ||
| } | ||
|
|
||
| impl_carryless_mul! { | ||
| u8, u16, u32, u64, usize | ||
| } | ||
|
|
||
| #[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")] | ||
| impl const CarrylessMul for u128 { | ||
| #[inline] | ||
| fn carryless_mul(self, rhs: Self) -> Self { | ||
| let l = u64::carryless_mul(self as u64, rhs as u64); | ||
| let lh = u64::carryless_mul(self as u64, (rhs >> 64) as u64); | ||
| let hl = u64::carryless_mul((self >> 64) as u64, rhs as u64); | ||
| let h = lh ^ hl ^ carryless_mul_high(self as u64, rhs as u64); | ||
| ((h as u128) << 64) | l as u128 | ||
| } | ||
| } | ||
|
|
||
| #[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")] | ||
| #[inline] | ||
| const fn carryless_mul_high(x: u64, y: u64) -> u64 { | ||
| // i.e. 0b100010001...0001 in binary. | ||
| const MASK: u64 = 0x1111_1111_1111_1111u64; | ||
|
|
||
| const M0: u64 = MASK; | ||
| const M1: u64 = M0 << 1; | ||
| const M2: u64 = M1 << 1; | ||
| const M3: u64 = M2 << 1; | ||
|
|
||
| macro_rules! mul { | ||
| ($x_mask_shift:literal, $y_mask_shift:literal) => {{ | ||
| let x = x & (MASK << $x_mask_shift); | ||
| let y = y & (MASK << $y_mask_shift); | ||
| crate::hint::select_unpredictable( | ||
| x == MASK << $x_mask_shift && y == MASK << $y_mask_shift, | ||
| // only case where the multiply overflows the 4-bit parts | ||
| 0x0101_0101_0101_0101u64 << ($x_mask_shift + $y_mask_shift), | ||
| x.carrying_mul(y, 0).1, | ||
| ) | ||
| }}; | ||
| } | ||
|
|
||
| let z0 = mul!(0, 0) ^ mul!(1, 3) ^ mul!(2, 2) ^ mul!(3, 1); | ||
| let z1 = mul!(0, 1) ^ mul!(1, 0) ^ mul!(2, 3) ^ mul!(3, 2); | ||
| let z2 = mul!(0, 2) ^ mul!(1, 1) ^ mul!(2, 0) ^ mul!(3, 3); | ||
| let z3 = mul!(0, 3) ^ mul!(1, 2) ^ mul!(2, 1) ^ mul!(3, 0); | ||
|
|
||
| (z0 & M0) | (z1 & M1) | (z2 & M2) | (z3 & M3) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2179,6 +2179,20 @@ pub const unsafe fn unchecked_funnel_shr<T: [const] fallback::FunnelShift>( | |
| unsafe { a.unchecked_funnel_shr(b, shift) } | ||
| } | ||
|
|
||
| /// Carryless multiply. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this meant to be a self-contained description? To me these words mean nothing.^^ |
||
| /// | ||
| /// Safe versions of this intrinsic are available on the integer primitives | ||
| /// via the `carryless_mul` method. For example, [`u32::carryless_mul`]. | ||
| #[rustc_intrinsic] | ||
| #[rustc_nounwind] | ||
| #[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")] | ||
| #[unstable(feature = "uint_carryless_mul", issue = "152080")] | ||
| pub const fn carryless_mul<T: [const] fallback::CarrylessMul>(a: T, b: T) -> T { | ||
| // NOTE: while this implementation could serve as the specification, rustc_const_eval | ||
| // actually implements a simpler but less efficient variant as the specification. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? As I ask above, maybe our fallback could be the simple spec? |
||
| a.carryless_mul(b) | ||
| } | ||
|
|
||
| /// This is an implementation detail of [`crate::ptr::read`] and should | ||
| /// not be used anywhere else. See its comments for why this exists. | ||
| /// | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please use proper spelling and capitalization.