
Commit 4a1d461

committed Jan 1, 2025
Faster work stealing iterator
1 parent 177fc32 commit 4a1d461

File tree

9 files changed (+183 -74 lines)

README.md

+8 -8
@@ -53,7 +53,7 @@ Place input files in `input/yearYYYY/dayDD.txt` including leading zeroes. For ex
 ## Performance
 
 Benchmarks are measured using the built-in `cargo bench` tool run on an [Apple M2 Max][apple-link].
-All 250 solutions from 2024 to 2015 complete sequentially in **585 milliseconds**.
+All 250 solutions from 2024 to 2015 complete sequentially in **584 milliseconds**.
 Interestingly 84% of the total time is spent on just 9 solutions.
 Performance is reasonable even on older hardware, for example a 2011 MacBook Pro with an
 [Intel i7-2720QM][intel-link] processor takes 3.5 seconds to run the same 225 solutions.
@@ -62,7 +62,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 
 | Year | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 | 2023 | 2024 |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| Benchmark (ms) | 24 | 120 | 89 | 35 | 16 | 272 | 9 | 8 | 6 | 6 |
+| Benchmark (ms) | 24 | 120 | 89 | 35 | 16 | 272 | 9 | 8 | 6 | 5 |
 
 ## 2024
 
@@ -75,7 +75,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 | 3 | [Mull It Over](https://adventofcode.com/2024/day/3) | [Source](src/year2024/day03.rs) | 8 |
 | 4 | [Ceres Search](https://adventofcode.com/2024/day/4) | [Source](src/year2024/day04.rs) | 77 |
 | 5 | [Print Queue](https://adventofcode.com/2024/day/5) | [Source](src/year2024/day05.rs) | 18 |
-| 6 | [Guard Gallivant](https://adventofcode.com/2024/day/6) | [Source](src/year2024/day06.rs) | 386 |
+| 6 | [Guard Gallivant](https://adventofcode.com/2024/day/6) | [Source](src/year2024/day06.rs) | 331 |
 | 7 | [Bridge Repair](https://adventofcode.com/2024/day/7) | [Source](src/year2024/day07.rs) | 136 |
 | 8 | [Resonant Collinearity](https://adventofcode.com/2024/day/8) | [Source](src/year2024/day08.rs) | 8 |
 | 9 | [Disk Fragmenter](https://adventofcode.com/2024/day/9) | [Source](src/year2024/day09.rs) | 106 |
@@ -89,9 +89,9 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 | 17 | [Chronospatial Computer](https://adventofcode.com/2024/day/17) | [Source](src/year2024/day17.rs) | 2 |
 | 18 | [RAM Run](https://adventofcode.com/2024/day/18) | [Source](src/year2024/day18.rs) | 42 |
 | 19 | [Linen Layout](https://adventofcode.com/2024/day/19) | [Source](src/year2024/day19.rs) | 118 |
-| 20 | [Race Condition](https://adventofcode.com/2024/day/20) | [Source](src/year2024/day20.rs) | 1354 |
+| 20 | [Race Condition](https://adventofcode.com/2024/day/20) | [Source](src/year2024/day20.rs) | 1038 |
 | 21 | [Keypad Conundrum](https://adventofcode.com/2024/day/21) | [Source](src/year2024/day21.rs) | 19 |
-| 22 | [Monkey Market](https://adventofcode.com/2024/day/22) | [Source](src/year2024/day22.rs) | 1350 |
+| 22 | [Monkey Market](https://adventofcode.com/2024/day/22) | [Source](src/year2024/day22.rs) | 1216 |
 | 23 | [LAN Party](https://adventofcode.com/2024/day/23) | [Source](src/year2024/day23.rs) | 43 |
 | 24 | [Crossed Wires](https://adventofcode.com/2024/day/24) | [Source](src/year2024/day24.rs) | 23 |
 | 25 | [Code Chronicle](https://adventofcode.com/2024/day/25) | [Source](src/year2024/day25.rs) | 8 |
@@ -113,7 +113,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 | 9 | [Mirage Maintenance](https://adventofcode.com/2023/day/9) | [Source](src/year2023/day09.rs) | 18 |
 | 10 | [Pipe Maze](https://adventofcode.com/2023/day/10) | [Source](src/year2023/day10.rs) | 41 |
 | 11 | [Cosmic Expansion](https://adventofcode.com/2023/day/11) | [Source](src/year2023/day11.rs) | 12 |
-| 12 | [Hot Springs](https://adventofcode.com/2023/day/12) | [Source](src/year2023/day12.rs) | 440 |
+| 12 | [Hot Springs](https://adventofcode.com/2023/day/12) | [Source](src/year2023/day12.rs) | 387 |
 | 13 | [Point of Incidence](https://adventofcode.com/2023/day/13) | [Source](src/year2023/day13.rs) | 66 |
 | 14 | [Parabolic Reflector Dish](https://adventofcode.com/2023/day/14) | [Source](src/year2023/day14.rs) | 632 |
 | 15 | [Lens Library](https://adventofcode.com/2023/day/15) | [Source](src/year2023/day15.rs) | 84 |
@@ -183,7 +183,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 | 15 | [Chiton](https://adventofcode.com/2021/day/15) | [Source](src/year2021/day15.rs) | 2403 |
 | 16 | [Packet Decoder](https://adventofcode.com/2021/day/16) | [Source](src/year2021/day16.rs) | 6 |
 | 17 | [Trick Shot](https://adventofcode.com/2021/day/17) | [Source](src/year2021/day17.rs) | 7 |
-| 18 | [Snailfish](https://adventofcode.com/2021/day/18) | [Source](src/year2021/day18.rs) | 501 |
+| 18 | [Snailfish](https://adventofcode.com/2021/day/18) | [Source](src/year2021/day18.rs) | 404 |
 | 19 | [Beacon Scanner](https://adventofcode.com/2021/day/19) | [Source](src/year2021/day19.rs) | 615 |
 | 20 | [Trench Map](https://adventofcode.com/2021/day/20) | [Source](src/year2021/day20.rs) | 2066 |
 | 21 | [Dirac Dice](https://adventofcode.com/2021/day/21) | [Source](src/year2021/day21.rs) | 278 |
@@ -272,7 +272,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 | 8 | [Memory Maneuver](https://adventofcode.com/2018/day/8) | [Source](src/year2018/day08.rs) | 24 |
 | 9 | [Marble Mania](https://adventofcode.com/2018/day/9) | [Source](src/year2018/day09.rs) | 909 |
 | 10 | [The Stars Align](https://adventofcode.com/2018/day/10) | [Source](src/year2018/day10.rs) | 11 |
-| 11 | [Chronal Charge](https://adventofcode.com/2018/day/11) | [Source](src/year2018/day11.rs) | 1404 |
+| 11 | [Chronal Charge](https://adventofcode.com/2018/day/11) | [Source](src/year2018/day11.rs) | 1156 |
 | 12 | [Subterranean Sustainability](https://adventofcode.com/2018/day/12) | [Source](src/year2018/day12.rs) | 77 |
 | 13 | [Mine Cart Madness](https://adventofcode.com/2018/day/13) | [Source](src/year2018/day13.rs) | 382 |
 | 14 | [Chocolate Charts](https://adventofcode.com/2018/day/14) | [Source](src/year2018/day14.rs) | 24000 |

docs/pie-2024.svg

+11 -11

(image diff not rendered)

src/util/thread.rs

+133 -20
@@ -2,13 +2,18 @@
 //! [scoped](https://doc.rust-lang.org/stable/std/thread/fn.scope.html)
 //! threads equals to the number of cores on the machine. Unlike normal threads, scoped threads
 //! can borrow data from their environment.
+use std::sync::atomic::{AtomicUsize, Ordering::Relaxed};
 use std::thread::*;
 
+// Usually the number of physical cores.
+fn threads() -> usize {
+    available_parallelism().unwrap().get()
+}
+
 /// Spawn `n` scoped threads, where `n` is the available parallelism.
-pub fn spawn<F, T>(f: F)
+pub fn spawn<F>(f: F)
 where
-    F: FnOnce() -> T + Copy + Send,
-    T: Send,
+    F: Fn() + Copy + Send,
 {
     scope(|scope| {
         for _ in 0..threads() {
@@ -17,31 +22,139 @@ where
     });
 }
 
-/// Splits `items` into batches, one per thread. Items are assigned in a round robin fashion,
-/// to achieve a crude load balacing in case some items are more complex to process than others.
-pub fn spawn_batches<F, T, U>(mut items: Vec<U>, f: F)
+/// Spawns `n` scoped threads that each receive a
+/// [work stealing](https://en.wikipedia.org/wiki/Work_stealing) iterator.
+/// Work stealing is an efficient strategy that keeps each CPU core busy when some items take longer
+/// than others to process, used by popular libraries such as [rayon](https://github.com/rayon-rs/rayon).
+/// Processing at different rates also happens on many modern CPUs with
+/// [heterogeneous performance and efficiency cores](https://en.wikipedia.org/wiki/ARM_big.LITTLE).
+pub fn spawn_parallel_iterator<F, T>(items: &[T], f: F)
 where
-    F: FnOnce(Vec<U>) -> T + Copy + Send,
-    T: Send,
-    U: Send,
+    F: Fn(ParIter<'_, T>) + Copy + Send,
+    T: Sync,
 {
     let threads = threads();
-    let mut batches: Vec<_> = (0..threads).map(|_| Vec::new()).collect();
-    let mut index = 0;
+    let size = items.len().div_ceil(threads);
 
-    // Round robin items over each thread.
-    while let Some(next) = items.pop() {
-        batches[index % threads].push(next);
-        index += 1;
-    }
+    // Initially divide work as evenly as possible amongst each worker thread.
+    let workers: Vec<_> = (0..threads)
+        .map(|id| {
+            let start = (id * size).min(items.len());
+            let end = (start + size).min(items.len());
+            CachePadding::new(pack(start, end))
+        })
+        .collect();
+    let workers = workers.as_slice();
 
     scope(|scope| {
-        for batch in batches {
-            scope.spawn(move || f(batch));
+        for id in 0..threads {
+            scope.spawn(move || f(ParIter { id, items, workers }));
         }
     });
 }
 
-fn threads() -> usize {
-    available_parallelism().unwrap().get()
+pub struct ParIter<'a, T> {
+    id: usize,
+    items: &'a [T],
+    workers: &'a [CachePadding],
+}
+
+impl<'a, T> Iterator for ParIter<'a, T> {
+    type Item = &'a T;
+
+    fn next(&mut self) -> Option<&'a T> {
+        // First try taking from our own queue.
+        let worker = &self.workers[self.id];
+        let current = worker.increment();
+        let (start, end) = unpack(current);
+
+        // There are still items to process.
+        if start < end {
+            return Some(&self.items[start]);
+        }
+
+        // Steal from another worker, [spinlocking](https://en.wikipedia.org/wiki/Spinlock)
+        // until we acquire new items to process or there's nothing left to do.
+        loop {
+            // Find the worker with the most remaining items.
+            let available = self
+                .workers
+                .iter()
+                .filter_map(|other| {
+                    let current = other.load();
+                    let (start, end) = unpack(current);
+                    let size = end.saturating_sub(start);
+
+                    (size > 0).then_some((other, current, size))
+                })
+                .max_by_key(|t| t.2);
+
+            if let Some((other, current, size)) = available {
+                // Split the work items into two roughly equal piles.
+                let (start, end) = unpack(current);
+                let middle = start + size.div_ceil(2);
+
+                let next = pack(middle, end);
+                let stolen = pack(start + 1, middle);
+
+                // We could be preempted by another thread stealing or by the owning worker
+                // thread finishing an item, so check indices are still unmodified.
+                if other.compare_exchange(current, next) {
+                    worker.store(stolen);
+                    break Some(&self.items[start]);
+                }
+            } else {
+                // No work remaining.
+                break None;
+            }
+        }
+    }
+}
+
+/// Intentionally force alignment to 128 bytes to make a best effort attempt to place each atomic
+/// on its own cache line. This reduces contention and improves performance for common
+/// CPU caching protocols such as [MESI](https://en.wikipedia.org/wiki/MESI_protocol).
+#[repr(align(128))]
+pub struct CachePadding {
+    atomic: AtomicUsize,
+}
+
+/// Convenience wrapper methods around atomic operations. Both start and end indices are packed
+/// into a single atomic so that we can use the fastest and easiest to reason about `Relaxed`
+/// ordering.
+impl CachePadding {
+    #[inline]
+    fn new(n: usize) -> Self {
+        CachePadding { atomic: AtomicUsize::new(n) }
+    }
+
+    #[inline]
+    fn increment(&self) -> usize {
+        self.atomic.fetch_add(1, Relaxed)
+    }
+
+    #[inline]
+    fn load(&self) -> usize {
+        self.atomic.load(Relaxed)
+    }
+
+    #[inline]
+    fn store(&self, n: usize) {
+        self.atomic.store(n, Relaxed);
+    }
+
+    #[inline]
+    fn compare_exchange(&self, current: usize, new: usize) -> bool {
+        self.atomic.compare_exchange(current, new, Relaxed, Relaxed).is_ok()
+    }
+}
+
+#[inline]
+fn pack(start: usize, end: usize) -> usize {
+    (end << 32) | start
+}
+
+#[inline]
+fn unpack(both: usize) -> (usize, usize) {
+    (both & 0xffffffff, both >> 32)
 }
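
A quick standalone sanity check of the two tricks above. This is a sketch, not part of the commit: it mirrors the private `pack`/`unpack` helpers and the `CachePadding` layout, and assumes a 64-bit `usize` (the packing reserves 32 bits per index).

```rust
use std::mem::{align_of, size_of};
use std::sync::atomic::AtomicUsize;

// Local mirrors of the commit's private helpers, assuming 64-bit `usize`.
fn pack(start: usize, end: usize) -> usize {
    (end << 32) | start
}

fn unpack(both: usize) -> (usize, usize) {
    (both & 0xffffffff, both >> 32)
}

#[repr(align(128))]
struct CachePadding {
    #[allow(dead_code)]
    atomic: AtomicUsize,
}

fn main() {
    // Round trip: a queue currently holding items 3..10.
    let packed = pack(3, 10);
    assert_eq!(unpack(packed), (3, 10));

    // Adding 1 to the packed value bumps only the low 32 bits, so
    // `fetch_add(1, Relaxed)` claims an item with a single atomic add.
    assert_eq!(unpack(packed + 1), (4, 10));

    // The forced alignment gives each atomic its own 128-byte slot,
    // keeping workers on separate cache lines.
    assert_eq!(align_of::<CachePadding>(), 128);
    assert_eq!(size_of::<CachePadding>(), 128);
}
```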

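For orientation, here is a minimal usage sketch of the new entry point, matching the shape of the call sites migrated below. The `aoc` crate name and module path are assumptions based on this repository's layout.

```rust
use std::sync::atomic::{AtomicU64, Ordering::Relaxed};

// Assumed import path, based on the `src/util/thread.rs` location.
use aoc::util::thread::spawn_parallel_iterator;

fn main() {
    let items: Vec<u64> = (1..=1000).collect();
    let total = AtomicU64::new(0);

    // Every scoped worker receives a `ParIter` view of the same slice.
    // Each item is handed out exactly once: fast threads steal leftover
    // ranges from slow ones instead of sitting idle.
    spawn_parallel_iterator(&items, |iter| {
        let partial: u64 = iter.map(|&n| n * n).sum();
        total.fetch_add(partial, Relaxed);
    });

    assert_eq!(total.into_inner(), (1..=1000u64).map(|n| n * n).sum::<u64>());
}
```
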
src/year2018/day11.rs

+6 -12
@@ -43,15 +43,10 @@ pub fn parse(input: &str) -> Vec<Result> {
     }
 
     // Use as many cores as possible to parallelize the search.
-    // Smaller sizes take more time so keep batches roughly the same effort so that some
-    // threads are not finishing too soon and waiting idle, while others are still busy.
-    // For example if there are 4 cores, then they will be assigned sizes:
-    // * 1, 5, 9, ..
-    // * 2, 6, 10, ..
-    // * 3, 7, 11, ..
-    // * 4, 8, 12, ..
+    // Smaller sizes take more time so use work stealing to keep all cores busy.
+    let items: Vec<_> = (1..301).collect();
     let shared = Shared { sat, mutex: Mutex::new(Vec::new()) };
-    spawn_batches((1..301).collect(), |batch| worker(&shared, batch));
+    spawn_parallel_iterator(&items, |iter| worker(&shared, iter));
     shared.mutex.into_inner().unwrap()
 }
 
@@ -65,10 +60,9 @@ pub fn part2(input: &[Result]) -> String {
     format!("{x},{y},{size}")
 }
 
-fn worker(shared: &Shared, batch: Vec<usize>) {
-    let result: Vec<_> = batch
-        .into_iter()
-        .map(|size| {
+fn worker(shared: &Shared, iter: ParIter<'_, usize>) {
+    let result: Vec<_> = iter
+        .map(|&size| {
             let (power, x, y) = square(&shared.sat, size);
             Result { x, y, size, power }
         })
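
For intuition on this change: the old scheme interleaved sizes round-robin, while the new iterator starts each worker on one contiguous chunk and relies on stealing to rebalance. A standalone sketch of the initial split, assuming 4 threads and mirroring the arithmetic in `spawn_parallel_iterator`:

```rust
fn main() {
    // The 300 square sizes searched by this solution.
    let items: Vec<usize> = (1..301).collect();
    let threads = 4; // illustrative core count

    // Same initial division as `spawn_parallel_iterator`.
    let size = items.len().div_ceil(threads);
    for id in 0..threads {
        let start = (id * size).min(items.len());
        let end = (start + size).min(items.len());
        // Prints: worker 0: sizes 1..=75, worker 1: sizes 76..=150, ...
        println!("worker {id}: sizes {}..={}", items[start], items[end - 1]);
    }
}
```

Smaller sizes take longer, so the worker starting on sizes 1..=75 holds the most expensive chunk; the others finish their cheap chunks early and steal halves of its remaining range rather than idling.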

src/year2021/day18.rs

+4 -5
@@ -83,18 +83,17 @@ pub fn part2(input: &[Snailfish]) -> i32 {
         }
     }
 
-    // Use as many cores as possible to parallelize the calculation,
-    // breaking the work into roughly equally size batches.
+    // Use as many cores as possible to parallelize the calculation.
     let shared = AtomicI32::new(0);
-    spawn_batches(pairs, |batch| worker(&shared, &batch));
+    spawn_parallel_iterator(&pairs, |iter| worker(&shared, iter));
     shared.load(Ordering::Relaxed)
 }
 
 /// Pair addition is independent so we can parallelize across multiple threads.
-fn worker(shared: &AtomicI32, batch: &[(&Snailfish, &Snailfish)]) {
+fn worker(shared: &AtomicI32, iter: ParIter<'_, (&Snailfish, &Snailfish)>) {
     let mut partial = 0;
 
-    for (a, b) in batch {
+    for (a, b) in iter {
        partial = partial.max(magnitude(&mut add(a, b)));
     }
 
src/year2023/day12.rs

+9 -7
@@ -137,29 +137,31 @@ pub fn parse(input: &str) -> Vec<Spring<'_>> {
 }
 
 pub fn part1(input: &[Spring<'_>]) -> u64 {
-    solve(input, 1)
+    solve(input.iter(), 1)
 }
 
 pub fn part2(input: &[Spring<'_>]) -> u64 {
-    // Use as many cores as possible to parallelize the calculation,
-    // breaking the work into roughly equally size batches.
+    // Use as many cores as possible to parallelize the calculation.
     let shared = AtomicU64::new(0);
-    spawn_batches(input.to_vec(), |batch| {
-        let partial = solve(&batch, 5);
+    spawn_parallel_iterator(input, |iter| {
+        let partial = solve(iter, 5);
         shared.fetch_add(partial, Ordering::Relaxed);
     });
     shared.load(Ordering::Relaxed)
 }
 
-pub fn solve(input: &[Spring<'_>], repeat: usize) -> u64 {
+pub fn solve<'a, I>(iter: I, repeat: usize) -> u64
+where
+    I: Iterator<Item = &'a Spring<'a>>,
+{
     let mut result = 0;
     let mut pattern = Vec::new();
     let mut springs = Vec::new();
     // Exact size is not too important as long as there's enough space.
     let mut broken = vec![0; 200];
     let mut table = vec![0; 200 * 50];
 
-    for (first, second) in input {
+    for (first, second) in iter {
         // Create input sequence reusing the buffers to minimize memory allocations.
         pattern.clear();
         springs.clear();
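
The interesting part of this change is the signature: `solve` is now generic over any `Iterator<Item = &Spring>`, so the sequential `part1` and the work-stealing `part2` share one code path. A minimal sketch of the same pattern, using a stand-in `u64` item type purely for illustration:

```rust
// Generic over any iterator of references, like the new `solve` signature.
fn total<'a, I>(iter: I) -> u64
where
    I: Iterator<Item = &'a u64>,
{
    iter.copied().sum()
}

fn main() {
    let input = [1u64, 2, 3];

    // Sequential call site, like `part1` passing `input.iter()`.
    assert_eq!(total(input.iter()), 6);

    // A `ParIter<'_, u64>` satisfies the same bound, since it also
    // implements `Iterator<Item = &u64>`; that is how `part2` reuses
    // this exact code path across worker threads.
}
```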

src/year2024/day06.rs

+3 -4
@@ -78,14 +78,13 @@ pub fn part2(grid: &Grid<u8>) -> usize {
     let shortcut = Shortcut::from(&grid);
     let total = AtomicUsize::new(0);
 
-    spawn_batches(path, |batch| worker(&shortcut, &total, &batch));
+    spawn_parallel_iterator(&path, |iter| worker(&shortcut, &total, iter));
     total.into_inner()
 }
 
-fn worker(shortcut: &Shortcut, total: &AtomicUsize, batch: &[(Point, Point)]) {
+fn worker(shortcut: &Shortcut, total: &AtomicUsize, iter: ParIter<'_, (Point, Point)>) {
     let mut seen = FastSet::new();
-    let result = batch
-        .iter()
+    let result = iter
         .filter(|(position, direction)| {
             seen.clear();
             is_cycle(shortcut, &mut seen, *position, *direction)

src/year2024/day20.rs

+4 -3
@@ -98,16 +98,17 @@ pub fn part2(time: &Grid<i32>) -> u32 {
         }
     }
 
+    // Use as many cores as possible to parallelize the remaining search.
     let total = AtomicU32::new(0);
-    spawn_batches(items, |batch| worker(time, &total, batch));
+    spawn_parallel_iterator(&items, |iter| worker(time, &total, iter));
     total.into_inner()
 }
 
-fn worker(time: &Grid<i32>, total: &AtomicU32, batch: Vec<Point>) {
+fn worker(time: &Grid<i32>, total: &AtomicU32, iter: ParIter<'_, Point>) {
     let mut cheats = 0;
 
     // (p1, p2) is the reciprocal of (p2, p1) so we only need to check each pair once.
-    for point in batch {
+    for &point in iter {
         for x in 2..21 {
             cheats += check(time, point, Point::new(x, 0));
         }

src/year2024/day22.rs

+5 -4
@@ -29,10 +29,11 @@ struct Exclusive {
 }
 
 pub fn parse(input: &str) -> Input {
-    let numbers = input.iter_unsigned().collect();
+    let numbers: Vec<_> = input.iter_unsigned().collect();
     let mutex = Mutex::new(Exclusive { part_one: 0, part_two: vec![0; 130321] });
 
-    spawn_batches(numbers, |batch| worker(&mutex, &batch));
+    // Use as many cores as possible to parallelize the remaining search.
+    spawn_parallel_iterator(&numbers, |iter| worker(&mutex, iter));
 
     let Exclusive { part_one, part_two } = mutex.into_inner().unwrap();
     (part_one, *part_two.iter().max().unwrap())
@@ -46,12 +47,12 @@ pub fn part2(input: &Input) -> u16 {
     input.1
 }
 
-fn worker(mutex: &Mutex<Exclusive>, batch: &[usize]) {
+fn worker(mutex: &Mutex<Exclusive>, iter: ParIter<'_, usize>) {
     let mut part_one = 0;
     let mut part_two = vec![0; 130321];
     let mut seen = vec![u16::MAX; 130321];
 
-    for (id, number) in batch.iter().enumerate() {
+    for (id, number) in iter.enumerate() {
         let id = id as u16;
 
         let zeroth = *number;
