Skip to content

Commit 98a877b

Browse files
committed
Implement much faster SIMD MD5 alternative
1 parent 09901f3 commit 98a877b

File tree

6 files changed

+252
-91
lines changed

6 files changed

+252
-91
lines changed

Cargo.toml

+1 -1
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ ptr_as_ptr = "warn"
199199
ptr_cast_constness = "warn"
200200
pub_underscore_fields = "warn"
201201
pub_use = "warn"
202-
pub_with_shorthand = "warn"
202+
pub_with_shorthand = "allow"
203203
pub_without_shorthand = "warn"
204204
question_mark_used = "allow"
205205
range_minus_one = "warn"

README.md

+3 -3
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
306306
| 2 | [Bathroom Security](https://adventofcode.com/2016/day/2) | [Source](src/year2016/day02.rs) | 9 |
307307
| 3 | [Squares With Three Sides](https://adventofcode.com/2016/day/3) | [Source](src/year2016/day03.rs) | 26 |
308308
| 4 | [Security Through Obscurity](https://adventofcode.com/2016/day/4) | [Source](src/year2016/day04.rs) | 75 |
309-
| 5 | [How About a Nice Game of Chess?](https://adventofcode.com/2016/day/5) | [Source](src/year2016/day05.rs) | 212000 |
309+
| 5 | [How About a Nice Game of Chess?](https://adventofcode.com/2016/day/5) | [Source](src/year2016/day05.rs) | 37000 |
310310
| 6 | [Signals and Noise](https://adventofcode.com/2016/day/6) | [Source](src/year2016/day06.rs) | 5 |
311311
| 7 | [Internet Protocol Version 7](https://adventofcode.com/2016/day/7) | [Source](src/year2016/day07.rs) | 354 |
312312
| 8 | [Two-Factor Authentication](https://adventofcode.com/2016/day/8) | [Source](src/year2016/day08.rs) | 10 |
@@ -315,7 +315,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
315315
| 11 | [Radioisotope Thermoelectric Generators](https://adventofcode.com/2016/day/11) | [Source](src/year2016/day11.rs) | 785 |
316316
| 12 | [Leonardo's Monorail](https://adventofcode.com/2016/day/12) | [Source](src/year2016/day12.rs) | 1 |
317317
| 13 | [A Maze of Twisty Little Cubicles](https://adventofcode.com/2016/day/13) | [Source](src/year2016/day13.rs) | 4 |
318-
| 14 | [One-Time Pad](https://adventofcode.com/2016/day/14) | [Source](src/year2016/day14.rs) | 434000 |
318+
| 14 | [One-Time Pad](https://adventofcode.com/2016/day/14) | [Source](src/year2016/day14.rs) | 79000 |
319319
| 15 | [Timing is Everything](https://adventofcode.com/2016/day/15) | [Source](src/year2016/day15.rs) | 1 |
320320
| 16 | [Dragon Checksum](https://adventofcode.com/2016/day/16) | [Source](src/year2016/day16.rs) | 1 |
321321
| 17 | [Two Steps Forward](https://adventofcode.com/2016/day/17) | [Source](src/year2016/day17.rs) | 14254 |
@@ -337,7 +337,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
337337
| 1 | [Not Quite Lisp](https://adventofcode.com/2015/day/1) | [Source](src/year2015/day01.rs) | 2 |
338338
| 2 | [I Was Told There Would Be No Math](https://adventofcode.com/2015/day/2) | [Source](src/year2015/day02.rs) | 8 |
339339
| 3 | [Perfectly Spherical Houses in a Vacuum](https://adventofcode.com/2015/day/3) | [Source](src/year2015/day03.rs) | 100 |
340-
| 4 | [The Ideal Stocking Stuffer](https://adventofcode.com/2015/day/4) | [Source](src/year2015/day04.rs) | 76000 |
340+
| 4 | [The Ideal Stocking Stuffer](https://adventofcode.com/2015/day/4) | [Source](src/year2015/day04.rs) | 13000 |
341341
| 5 | [Doesn't He Have Intern-Elves For This?](https://adventofcode.com/2015/day/5) | [Source](src/year2015/day05.rs) | 39 |
342342
| 6 | [Probably a Fire Hazard](https://adventofcode.com/2015/day/6) | [Source](src/year2015/day06.rs) | 5780 |
343343
| 7 | [Some Assembly Required](https://adventofcode.com/2015/day/7) | [Source](src/year2015/day07.rs) | 27 |

src/year2015/day04.rs

+55 -0
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,10 @@ pub fn parse(input: &str) -> Shared {
4747
// Use as many cores as possible to parallelize the remaining search.
4848
thread::scope(|scope| {
4949
for _ in 0..thread::available_parallelism().unwrap().get() {
50+
#[cfg(not(feature = "simd"))]
5051
scope.spawn(|| worker(&shared));
52+
#[cfg(feature = "simd")]
53+
scope.spawn(|| simd::worker(&shared));
5154
}
5255
});
5356

@@ -83,6 +86,7 @@ fn check_hash(buffer: &mut [u8], size: usize, n: u32, shared: &Shared) {
8386
}
8487
}
8588

89+
#[cfg(not(feature = "simd"))]
8690
fn worker(shared: &Shared) {
8791
while !shared.done.load(Ordering::Relaxed) {
8892
let offset = shared.counter.fetch_add(1000, Ordering::Relaxed);
@@ -98,3 +102,54 @@ fn worker(shared: &Shared) {
98102
}
99103
}
100104
}
105+
106+
#[cfg(feature = "simd")]
107+
mod simd {
108+
use super::*;
109+
use crate::util::md5::simd::hash;
110+
use std::simd::{LaneCount, SupportedLaneCount};
111+
112+
#[allow(clippy::needless_range_loop)]
113+
fn check_hash_simd<const N: usize>(
114+
buffers: &mut [[u8; 64]],
115+
size: usize,
116+
start: u32,
117+
offset: u32,
118+
shared: &Shared,
119+
) where
120+
LaneCount<N>: SupportedLaneCount,
121+
{
122+
// Format macro is very slow, so update digits directly
123+
for i in 0..N {
124+
let n = offset + i as u32;
125+
buffers[i][size - 3] = b'0' + (n / 100) as u8;
126+
buffers[i][size - 2] = b'0' + ((n / 10) % 10) as u8;
127+
buffers[i][size - 1] = b'0' + (n % 10) as u8;
128+
}
129+
130+
let (result, ..) = hash::<N>(buffers, size);
131+
132+
for i in 0..N {
133+
if result[i] & 0xffffff00 == 0 {
134+
shared.second.fetch_min(start + offset + i as u32, Ordering::Relaxed);
135+
shared.done.store(true, Ordering::Relaxed);
136+
} else if result[i] & 0xfffff000 == 0 {
137+
shared.first.fetch_min(start + offset + i as u32, Ordering::Relaxed);
138+
}
139+
}
140+
}
141+
142+
pub(super) fn worker(shared: &Shared) {
143+
while !shared.done.load(Ordering::Relaxed) {
144+
let start = shared.counter.fetch_add(1000, Ordering::Relaxed);
145+
let (prefix, size) = format_string(&shared.prefix, start);
146+
let mut buffers = [prefix; 32];
147+
148+
for offset in (0..992).step_by(32) {
149+
check_hash_simd::<32>(&mut buffers, size, start, offset, shared);
150+
}
151+
152+
check_hash_simd::<8>(&mut buffers, size, start, 992, shared);
153+
}
154+
}
155+
}

src/year2016/day05.rs

+60 -0
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@ pub fn parse(input: &str) -> Vec<u32> {
3737
// Use as many cores as possible to parallelize the remaining search.
3838
thread::scope(|scope| {
3939
for _ in 0..thread::available_parallelism().unwrap().get() {
40+
#[cfg(not(feature = "simd"))]
4041
scope.spawn(|| worker(&shared, &mutex));
42+
#[cfg(feature = "simd")]
43+
scope.spawn(|| simd::worker(&shared, &mutex));
4144
}
4245
});
4346

@@ -93,6 +96,7 @@ fn check_hash(buffer: &mut [u8], size: usize, n: u32, shared: &Shared, mutex: &M
9396
}
9497
}
9598

99+
#[cfg(not(feature = "simd"))]
96100
fn worker(shared: &Shared, mutex: &Mutex<Exclusive>) {
97101
while !shared.done.load(Ordering::Relaxed) {
98102
let offset = shared.counter.fetch_add(1000, Ordering::Relaxed);
@@ -108,3 +112,59 @@ fn worker(shared: &Shared, mutex: &Mutex<Exclusive>) {
108112
}
109113
}
110114
}
115+
116+
#[cfg(feature = "simd")]
117+
mod simd {
118+
use super::*;
119+
use crate::util::md5::simd::hash;
120+
use std::simd::{LaneCount, SupportedLaneCount};
121+
122+
#[allow(clippy::needless_range_loop)]
123+
fn check_hash_simd<const N: usize>(
124+
buffers: &mut [[u8; 64]],
125+
size: usize,
126+
start: u32,
127+
offset: u32,
128+
shared: &Shared,
129+
mutex: &Mutex<Exclusive>,
130+
) where
131+
LaneCount<N>: SupportedLaneCount,
132+
{
133+
// Format macro is very slow, so update digits directly
134+
for i in 0..N {
135+
let n = offset + i as u32;
136+
buffers[i][size - 3] = b'0' + (n / 100) as u8;
137+
buffers[i][size - 2] = b'0' + ((n / 10) % 10) as u8;
138+
buffers[i][size - 1] = b'0' + (n % 10) as u8;
139+
}
140+
141+
let (result, ..) = hash::<N>(buffers, size);
142+
143+
for i in 0..N {
144+
if result[i] & 0xfffff000 == 0 {
145+
let mut exclusive = mutex.lock().unwrap();
146+
147+
exclusive.found.push((start + offset + i as u32, result[i]));
148+
exclusive.mask |= 1 << (result[i] >> 8);
149+
150+
if exclusive.mask & 0xff == 0xff {
151+
shared.done.store(true, Ordering::Relaxed);
152+
}
153+
}
154+
}
155+
}
156+
157+
pub(super) fn worker(shared: &Shared, mutex: &Mutex<Exclusive>) {
158+
while !shared.done.load(Ordering::Relaxed) {
159+
let start = shared.counter.fetch_add(1000, Ordering::Relaxed);
160+
let (prefix, size) = format_string(&shared.prefix, start);
161+
let mut buffers = [prefix; 32];
162+
163+
for offset in (0..992).step_by(32) {
164+
check_hash_simd::<32>(&mut buffers, size, start, offset, shared, mutex);
165+
}
166+
167+
check_hash_simd::<8>(&mut buffers, size, start, 992, shared, mutex);
168+
}
169+
}
170+
}

0 commit comments

Comments (0)