From 032f0bb5e5d5c5d982252f3d986359772eaa3610 Mon Sep 17 00:00:00 2001 From: Adam Niederer Date: Sat, 27 Jan 2018 15:57:16 -0500 Subject: [PATCH 1/6] Speed up count_hyper and num_chars_hyper Using `faster` yields a serious speedup over `simd`, and removes the need for feature segmentation. faster kabylake running 15 tests test bench_count_30000_hyper ... bench: 1,172 ns/iter (+/- 139) test bench_count_big_0100000_hyper ... bench: 3,892 ns/iter (+/- 9) test bench_count_big_1000000_hyper ... bench: 38,850 ns/iter (+/- 10,860) test bench_num_chars_30000_hyper ... bench: 805 ns/iter (+/- 133) test bench_num_chars_big_0100000_hyper ... bench: 3,059 ns/iter (+/- 464) test bench_num_chars_big_1000000_hyper ... bench: 34,522 ns/iter (+/- 2,631) faster nehalem test bench_count_30000_hyper ... bench: 2,478 ns/iter (+/- 328) test bench_count_big_0100000_hyper ... bench: 8,079 ns/iter (+/- 3) test bench_count_big_1000000_hyper ... bench: 80,784 ns/iter (+/- 24,800) test bench_num_chars_30000_hyper ... bench: 1,380 ns/iter (+/- 10) test bench_num_chars_big_0100000_hyper ... bench: 4,837 ns/iter (+/- 49) test bench_num_chars_big_1000000_hyper ... bench: 50,925 ns/iter (+/- 7,802) faster x86-64 test bench_count_30000_hyper ... bench: 2,347 ns/iter (+/- 2) test bench_count_big_0100000_hyper ... bench: 8,325 ns/iter (+/- 2,010) test bench_count_big_1000000_hyper ... bench: 83,036 ns/iter (+/- 14,618) test bench_num_chars_30000_hyper ... bench: 1,474 ns/iter (+/- 350) test bench_num_chars_big_0100000_hyper ... bench: 4,695 ns/iter (+/- 56) test bench_num_chars_big_1000000_hyper ... bench: 51,214 ns/iter (+/- 22,773) faster pentium test bench_count_30000_hyper ... bench: 50,619 ns/iter (+/- 27) test bench_count_big_0100000_hyper ... bench: 168,750 ns/iter (+/- 2,620) test bench_count_big_1000000_hyper ... bench: 1,793,224 ns/iter (+/- 184,043) test bench_num_chars_30000_hyper ... bench: 1,441 ns/iter (+/- 2) test bench_num_chars_big_0100000_hyper ... bench: 5,045 ns/iter (+/- 79) test bench_num_chars_big_1000000_hyper ... bench: 52,989 ns/iter (+/- 4,530) simd kabylake test bench_count_30000_hyper ... bench: 1,658 ns/iter (+/- 71) test bench_count_big_0100000_hyper ... bench: 5,999 ns/iter (+/- 11) test bench_count_big_1000000_hyper ... bench: 61,536 ns/iter (+/- 2,047) test bench_num_chars_30000_hyper ... bench: 5,506 ns/iter (+/- 176) test bench_num_chars_big_0100000_hyper ... bench: 19,045 ns/iter (+/- 2,317) test bench_num_chars_big_1000000_hyper ... bench: 190,179 ns/iter (+/- 6,082) simd nehalem test bench_count_30000_hyper ... bench: 2,011 ns/iter (+/- 33) test bench_count_big_0100000_hyper ... bench: 7,728 ns/iter (+/- 307) test bench_count_big_1000000_hyper ... bench: 77,853 ns/iter (+/- 5,531) test bench_num_chars_30000_hyper ... bench: 988 ns/iter (+/- 7) test bench_num_chars_big_0100000_hyper ... bench: 4,137 ns/iter (+/- 528) test bench_num_chars_big_1000000_hyper ... bench: 45,211 ns/iter (+/- 6,833) simd x86-64 test bench_count_30000_hyper ... bench: 2,286 ns/iter (+/- 313) test bench_count_big_0100000_hyper ... bench: 7,610 ns/iter (+/- 898) test bench_count_big_1000000_hyper ... bench: 79,711 ns/iter (+/- 5,352) test bench_num_chars_30000_hyper ... bench: 987 ns/iter (+/- 30) test bench_num_chars_big_0100000_hyper ... bench: 3,985 ns/iter (+/- 248) test bench_num_chars_big_1000000_hyper ... bench: 43,328 ns/iter (+/- 1,382) simd pentium error[E0432]: unresolved import `x86::sse2` --> /home/adam/.cargo/registry/src/github.com-1ecc6299db9ec823/simd-0.2.1/src/common.rs:16:10 | 16 | use x86::sse2::common; | ^^^^ Could not find `sse2` in `x86` error: aborting due to previous error error: Could not compile `simd`. warning: build failed, waiting for other jobs to finish... error: build failed --- Cargo.toml | 5 +- src/lib.rs | 149 +++++++---------------------------------------------- 2 files changed, 22 insertions(+), 132 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ef72107..678f1a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,11 +15,10 @@ appveyor = { repository = "llogiq/bytecount" } bench = false [features] -avx-accel = ["simd-accel"] -simd-accel = ["simd"] +simd-accel = ["faster"] [dependencies] -simd = { version = "0.2.0", optional = true } +faster = { version = "0.4.2", optional=true } [dev-dependencies] quickcheck = "0.6" diff --git a/src/lib.rs b/src/lib.rs index 8a59486..420eb9d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,17 +37,11 @@ #![deny(missing_docs)] #[cfg(feature = "simd-accel")] -extern crate simd; - -use core::{cmp, mem, slice, usize}; - +extern crate faster; #[cfg(feature = "simd-accel")] -use simd::u8x16; -#[cfg(feature = "avx-accel")] -use simd::x86::sse2::Sse2U8x16; -#[cfg(feature = "avx-accel")] -use simd::x86::avx::{LowHigh128, u8x32}; +use faster::*; +use core::{cmp, mem, ops, slice, usize}; trait ByteChunk: Copy { type Splat: Copy; @@ -100,78 +94,6 @@ impl ByteChunk for usize { } } -#[cfg(feature = "simd-accel")] -impl ByteChunk for u8x16 { - type Splat = Self; - - fn splat(byte: u8) -> Self { - Self::splat(byte) - } - - fn from_splat(splat: Self) -> Self { - splat - } - - fn is_leading_utf8_byte(self) -> Self { - (self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8() - } - - fn bytewise_equal(self, other: Self) -> Self { - self.eq(other).to_repr().to_u8() - } - - fn increment(self, incr: Self) -> Self { - // incr on -1 - self - incr - } - - fn sum(&self) -> usize { - let mut count = 0; - for i in 0..16 { - count += self.extract(i) as usize; - } - count - } -} - -#[cfg(feature = "avx-accel")] -impl ByteChunk for u8x32 { - type Splat = Self; - - fn splat(byte: u8) -> Self { - Self::splat(byte) - } - - fn from_splat(splat: Self) -> Self { - splat - } - - fn is_leading_utf8_byte(self) -> Self { - (self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8() - } - - fn bytewise_equal(self, other: Self) -> Self { - self.eq(other).to_repr().to_u8() - } - - fn increment(self, incr: Self) -> Self { - // incr on -1 - self - incr - } - - fn sum(&self) -> usize { - let zero = u8x16::splat(0); - let sad_lo = self.low().sad(zero); - let sad_hi = self.high().sad(zero); - - let mut count = 0; - count += (sad_lo.extract(0) + sad_lo.extract(1)) as usize; - count += (sad_hi.extract(0) + sad_hi.extract(1)) as usize; - count - } -} - - fn chunk_align(x: &[u8]) -> (&[u8], &[Chunk], &[u8]) { let align = mem::size_of::(); @@ -254,23 +176,21 @@ pub fn count(haystack: &[u8], needle: u8) -> usize { /// let number_of_spaces = bytecount::count(s, b' '); /// assert_eq!(number_of_spaces, 5); /// ``` -#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))] -pub fn count(haystack: &[u8], needle: u8) -> usize { - count_generic::(32, haystack, needle) -} - -/// Count occurrences of a byte in a slice of bytes, fast -/// -/// # Examples -/// -/// ``` -/// let s = b"This is a Text with spaces"; -/// let number_of_spaces = bytecount::count(s, b' '); -/// assert_eq!(number_of_spaces, 5); -/// ``` -#[cfg(feature = "avx-accel")] +#[cfg(feature = "simd-accel")] pub fn count(haystack: &[u8], needle: u8) -> usize { - count_generic::(64, haystack, needle) + let mut ret: usize = 0; + let mut i = 0; + let mut acc = u8s(0); + haystack.simd_iter().simd_for_each(u8s(needle.overflowing_add(1).0), |v| { + i += 1; + acc += (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01)); + if i == 255 { + ret += acc.scalar_reduce(0, |acc, s| acc + (s as usize)); + acc = u8s(0); + i = 0; + } + }); + ret + acc.scalar_reduce(0, |acc, s| acc + (s as usize)) } /// Count up to `(2^32)-1` occurrences of a byte in a slice @@ -333,7 +253,6 @@ fn num_chars_generic>(naive_below: usize, haysta count } - /// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast /// /// This function is safe to use on any byte array, valid UTF-8 or not, @@ -346,46 +265,18 @@ fn num_chars_generic>(naive_below: usize, haysta /// let char_count = bytecount::num_chars(swordfish.as_bytes()); /// assert_eq!(char_count, 4); /// ``` -#[cfg(not(feature = "simd-accel"))] pub fn num_chars(haystack: &[u8]) -> usize { // Never use [usize; 4] num_chars_generic::(32, haystack) } -/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast -/// -/// This function is safe to use on any byte array, valid UTF-8 or not, -/// but the output is only meaningful for well-formed UTF-8. -/// -/// # Example -/// -/// ``` -/// let swordfish = "メカジキ"; -/// let char_count = bytecount::num_chars(swordfish.as_bytes()); -/// assert_eq!(char_count, 4); -/// ``` -#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))] +#[cfg(feature = "simd-accel")] pub fn num_chars(haystack: &[u8]) -> usize { - num_chars_generic::(32, haystack) -} -/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast -/// -/// This function is safe to use on any byte array, valid UTF-8 or not, -/// but the output is only meaningful for well-formed UTF-8. -/// -/// # Example -/// -/// ``` -/// let swordfish = "メカジキ"; -/// let char_count = bytecount::num_chars(swordfish.as_bytes()); -/// assert_eq!(char_count, 4); -/// ``` -#[cfg(feature = "avx-accel")] -pub fn num_chars(haystack: &[u8]) -> usize { - num_chars_generic::(64, haystack) + } + /// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, simple /// /// This function is safe to use on any byte array, valid UTF-8 or not, From ecd5edffc62bff95ea1d226343b2cd42c13b985a Mon Sep 17 00:00:00 2001 From: Adam Niederer Date: Sat, 27 Jan 2018 16:44:32 -0500 Subject: [PATCH 2/6] Use naive method for small slices Even without faster, the "hyper" method of counting is slower for slices with a small size. --- Cargo.toml | 2 +- src/lib.rs | 30 +++++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 678f1a9..1b29a72 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ bench = false simd-accel = ["faster"] [dependencies] -faster = { version = "0.4.2", optional=true } +faster = { version = "0.4.2", optional = true } [dev-dependencies] quickcheck = "0.6" diff --git a/src/lib.rs b/src/lib.rs index 420eb9d..03a73a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -178,19 +178,23 @@ pub fn count(haystack: &[u8], needle: u8) -> usize { /// ``` #[cfg(feature = "simd-accel")] pub fn count(haystack: &[u8], needle: u8) -> usize { - let mut ret: usize = 0; - let mut i = 0; - let mut acc = u8s(0); - haystack.simd_iter().simd_for_each(u8s(needle.overflowing_add(1).0), |v| { - i += 1; - acc += (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01)); - if i == 255 { - ret += acc.scalar_reduce(0, |acc, s| acc + (s as usize)); - acc = u8s(0); - i = 0; - } - }); - ret + acc.scalar_reduce(0, |acc, s| acc + (s as usize)) + if haystack.len() < 100 { + naive_count(haystack, needle) + } else { + let mut ret = 0; + let mut i = 0; + let mut acc = u8s(0); + haystack.simd_iter().simd_for_each(u8s(needle.overflowing_add(1).0), |v| { + i += 1; + acc += (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01)); + if i == 255 { + ret += acc.scalar_reduce(0, |acc, s| acc + (s as usize)); + acc = u8s(0); + i = 0; + } + }); + ret + acc.scalar_reduce(0, |acc, s| acc + (s as usize)) + } } /// Count up to `(2^32)-1` occurrences of a byte in a slice From 1fc04c6992acaf3e75a7acf8f00b5a04c6795253 Mon Sep 17 00:00:00 2001 From: Adam Niederer Date: Sun, 28 Jan 2018 15:53:53 -0500 Subject: [PATCH 3/6] Update faster, README, tests --- .travis.yml | 9 +++++++-- Cargo.toml | 2 +- README.md | 17 +++++------------ 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0f166dc..54046f1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,8 +30,13 @@ matrix: - rust: nightly env: - ARCH=x86_64 - - FEATURES="--features avx-accel" - - RUSTFLAGS="-C target-feature=+avx" + - FEATURES="--features simd-accel" + - RUSTFLAGS="-C target-feature=+sse4.2" + - rust: nightly + env: + - ARCH=x86_64 + - FEATURES="--features simd-accel" + - RUSTFLAGS="-C target-cpu=native" addons: apt: packages: diff --git a/Cargo.toml b/Cargo.toml index 1b29a72..a921ee6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ bench = false simd-accel = ["faster"] [dependencies] -faster = { version = "0.4.2", optional = true } +faster = { version = "0.4.3", optional = true } [dev-dependencies] quickcheck = "0.6" diff --git a/README.md b/README.md index a5b09ef..c94fee6 100644 --- a/README.md +++ b/README.md @@ -28,28 +28,21 @@ fn main() { } ``` -bytecount supports two features to make use of modern CPU's features to speed up counting considerably. To allow your -users to use them, add the following to your `Cargo.toml`: +bytecount makes use of features in modern CPUs to speed up counting considerably. To use these features, +add the following to your `Cargo.toml`: ``` [features] -avx-accel = ["bytecount/avx-accel"] simd-accel = ["bytecount/simd-accel"] ``` -Now your users can compile with SSE support (available on most modern x86_64 processors) using: +Now your users can compile with SIMD support, regardless of processor type, using: ``` -cargo build --release --features simd-accel +RUSTFLAGS="-C target-cpu=native" cargo build --release --features simd-accel ``` -Or even with AVX support (which likely requires compiling for the native target CPU): - -``` -RUSTFLAGS="-C target-cpu=native" cargo build --release --features "simd-accel avx-accel" -``` - -The algorithm is explained in depth +The scalar algorithm is explained in depth [here](https://llogiq.github.io/2016/09/27/count.html). Note that for very short slices, the data parallelism will likely not win much performance gains. In those cases, a naive From e807753b9f4e16eec5b2c4845eb5373f1e286292 Mon Sep 17 00:00:00 2001 From: Adam Niederer Date: Tue, 30 Jan 2018 15:56:14 -0500 Subject: [PATCH 4/6] Update core SIMD algorithm Takes us from 30kns -> 18.9kns --- src/lib.rs | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 03a73a0..11363ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -182,18 +182,19 @@ pub fn count(haystack: &[u8], needle: u8) -> usize { naive_count(haystack, needle) } else { let mut ret = 0; - let mut i = 0; - let mut acc = u8s(0); - haystack.simd_iter().simd_for_each(u8s(needle.overflowing_add(1).0), |v| { - i += 1; - acc += (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01)); - if i == 255 { - ret += acc.scalar_reduce(0, |acc, s| acc + (s as usize)); - acc = u8s(0); - i = 0; - } - }); - ret + acc.scalar_reduce(0, |acc, s| acc + (s as usize)) + + for i in 0..haystack.len() / (u8s::WIDTH * 255) { + ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255]) + .simd_iter() + .simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| { + acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01)) + }).scalar_reduce(0, |acc, s| acc + (s as usize)); + } + ret + (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..]) + .simd_iter() + .simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| { + acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01)) + }).scalar_reduce(0, |acc, s| acc + (s as usize)) } } From bcd1902a6557a2d82942571c34e448acb849d441 Mon Sep 17 00:00:00 2001 From: Adam Niederer Date: Tue, 30 Jan 2018 16:51:10 -0500 Subject: [PATCH 5/6] Fix appveyor? --- appveyor.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 5f861bd..27056cc 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -15,8 +15,8 @@ environment: FEATURES: "--features simd-accel" - TARGET: i686-pc-windows-gnu CHANNEL: nightly - FEATURES: "--features avx-accel" - RUSTFLAGS: "-C target-feature=+avx" + FEATURES: "--features simd-accel" + RUSTFLAGS: "-C target-cpu=native" - TARGET: i686-pc-windows-msvc CHANNEL: stable - TARGET: i686-pc-windows-msvc @@ -28,8 +28,8 @@ environment: FEATURES: "--features simd-accel" - TARGET: i686-pc-windows-msvc CHANNEL: nightly - FEATURES: "--features avx-accel" - RUSTFLAGS: "-C target-feature=+avx" + FEATURES: "--features simd-accel" + RUSTFLAGS: "-C target-cpu=native" - TARGET: x86_64-pc-windows-gnu CHANNEL: stable - TARGET: x86_64-pc-windows-gnu @@ -41,8 +41,8 @@ environment: FEATURES: "--features simd-accel" - TARGET: x86_64-pc-windows-gnu CHANNEL: nightly - FEATURES: "--features avx-accel" - RUSTFLAGS: "-C target-feature=+avx" + FEATURES: "--features simd-accel" + RUSTFLAGS: "-C target-cpu=native" - TARGET: x86_64-pc-windows-msvc CHANNEL: stable - TARGET: x86_64-pc-windows-msvc @@ -54,8 +54,8 @@ environment: FEATURES: "--features simd-accel" - TARGET: x86_64-pc-windows-msvc CHANNEL: nightly - FEATURES: "--features avx-accel" - RUSTFLAGS: "-C target-feature=+avx" + FEATURES: "--features simd-accel" + RUSTFLAGS: "-C target-cpu=native" install: - curl -sSf -o rustup-init.exe https://win.rustup.rs/ From ccaa4583961490d9f9558c1389462d383a7c6013 Mon Sep 17 00:00:00 2001 From: Adam Niederer Date: Tue, 30 Jan 2018 17:21:05 -0500 Subject: [PATCH 6/6] Add faster-accelerated num_chars --- src/lib.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 11363ed..62b3bfb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,7 +41,7 @@ extern crate faster; #[cfg(feature = "simd-accel")] use faster::*; -use core::{cmp, mem, ops, slice, usize}; +use core::{cmp, mem, slice, usize}; trait ByteChunk: Copy { type Splat: Copy; @@ -270,15 +270,33 @@ fn num_chars_generic>(naive_below: usize, haysta /// let char_count = bytecount::num_chars(swordfish.as_bytes()); /// assert_eq!(char_count, 4); /// ``` +#[cfg(not(feature = "simd-accel"))] pub fn num_chars(haystack: &[u8]) -> usize { // Never use [usize; 4] num_chars_generic::(32, haystack) } +/// f #[cfg(feature = "simd-accel")] pub fn num_chars(haystack: &[u8]) -> usize { + if haystack.len() < 100 { + naive_num_chars(haystack) + } else { + let mut ret = 0; - + for i in 0..haystack.len() / (u8s::WIDTH * 255) { + ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255]) + .simd_iter() + .simd_reduce(u8s(0), u8s(0), |acc, v| { + acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01)) + }).scalar_reduce(0, |acc, s| acc + (s as usize)); + } + haystack.len() - ret - (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..]) + .simd_iter() + .simd_reduce(u8s(0), u8s(0), |acc, v| { + acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01)) + }).scalar_reduce(0, |acc, s| acc + (s as usize)) + } }