diff --git a/.travis.yml b/.travis.yml index 0f166dc..54046f1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,8 +30,13 @@ matrix: - rust: nightly env: - ARCH=x86_64 - - FEATURES="--features avx-accel" - - RUSTFLAGS="-C target-feature=+avx" + - FEATURES="--features simd-accel" + - RUSTFLAGS="-C target-feature=+sse4.2" + - rust: nightly + env: + - ARCH=x86_64 + - FEATURES="--features simd-accel" + - RUSTFLAGS="-C target-cpu=native" addons: apt: packages: diff --git a/Cargo.toml b/Cargo.toml index ef72107..a921ee6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,11 +15,10 @@ appveyor = { repository = "llogiq/bytecount" } bench = false [features] -avx-accel = ["simd-accel"] -simd-accel = ["simd"] +simd-accel = ["faster"] [dependencies] -simd = { version = "0.2.0", optional = true } +faster = { version = "0.4.3", optional = true } [dev-dependencies] quickcheck = "0.6" diff --git a/README.md b/README.md index a5b09ef..c94fee6 100644 --- a/README.md +++ b/README.md @@ -28,28 +28,21 @@ fn main() { } ``` -bytecount supports two features to make use of modern CPU's features to speed up counting considerably. To allow your -users to use them, add the following to your `Cargo.toml`: +bytecount makes use of features in modern CPUs to speed up counting considerably. 
To use these features, +add the following to your `Cargo.toml`: ``` [features] -avx-accel = ["bytecount/avx-accel"] simd-accel = ["bytecount/simd-accel"] ``` -Now your users can compile with SSE support (available on most modern x86_64 processors) using: +Now your users can compile with SIMD support, regardless of processor type, using: ``` -cargo build --release --features simd-accel +RUSTFLAGS="-C target-cpu=native" cargo build --release --features simd-accel ``` -Or even with AVX support (which likely requires compiling for the native target CPU): - -``` -RUSTFLAGS="-C target-cpu=native" cargo build --release --features "simd-accel avx-accel" -``` - -The algorithm is explained in depth +The scalar algorithm is explained in depth [here](https://llogiq.github.io/2016/09/27/count.html). Note that for very short slices, the data parallelism will likely not win much performance gains. In those cases, a naive diff --git a/appveyor.yml b/appveyor.yml index 5f861bd..27056cc 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -15,8 +15,8 @@ environment: FEATURES: "--features simd-accel" - TARGET: i686-pc-windows-gnu CHANNEL: nightly - FEATURES: "--features avx-accel" - RUSTFLAGS: "-C target-feature=+avx" + FEATURES: "--features simd-accel" + RUSTFLAGS: "-C target-cpu=native" - TARGET: i686-pc-windows-msvc CHANNEL: stable - TARGET: i686-pc-windows-msvc @@ -28,8 +28,8 @@ environment: FEATURES: "--features simd-accel" - TARGET: i686-pc-windows-msvc CHANNEL: nightly - FEATURES: "--features avx-accel" - RUSTFLAGS: "-C target-feature=+avx" + FEATURES: "--features simd-accel" + RUSTFLAGS: "-C target-cpu=native" - TARGET: x86_64-pc-windows-gnu CHANNEL: stable - TARGET: x86_64-pc-windows-gnu @@ -41,8 +41,8 @@ environment: FEATURES: "--features simd-accel" - TARGET: x86_64-pc-windows-gnu CHANNEL: nightly - FEATURES: "--features avx-accel" - RUSTFLAGS: "-C target-feature=+avx" + FEATURES: "--features simd-accel" + RUSTFLAGS: "-C target-cpu=native" - TARGET: x86_64-pc-windows-msvc 
CHANNEL: stable - TARGET: x86_64-pc-windows-msvc @@ -54,8 +54,8 @@ environment: FEATURES: "--features simd-accel" - TARGET: x86_64-pc-windows-msvc CHANNEL: nightly - FEATURES: "--features avx-accel" - RUSTFLAGS: "-C target-feature=+avx" + FEATURES: "--features simd-accel" + RUSTFLAGS: "-C target-cpu=native" install: - curl -sSf -o rustup-init.exe https://win.rustup.rs/ diff --git a/src/lib.rs b/src/lib.rs index 8a59486..62b3bfb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,17 +37,11 @@ #![deny(missing_docs)] #[cfg(feature = "simd-accel")] -extern crate simd; - -use core::{cmp, mem, slice, usize}; - +extern crate faster; #[cfg(feature = "simd-accel")] -use simd::u8x16; -#[cfg(feature = "avx-accel")] -use simd::x86::sse2::Sse2U8x16; -#[cfg(feature = "avx-accel")] -use simd::x86::avx::{LowHigh128, u8x32}; +use faster::*; +use core::{cmp, mem, slice, usize}; trait ByteChunk: Copy { type Splat: Copy; @@ -100,78 +94,6 @@ impl ByteChunk for usize { } } -#[cfg(feature = "simd-accel")] -impl ByteChunk for u8x16 { - type Splat = Self; - - fn splat(byte: u8) -> Self { - Self::splat(byte) - } - - fn from_splat(splat: Self) -> Self { - splat - } - - fn is_leading_utf8_byte(self) -> Self { - (self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8() - } - - fn bytewise_equal(self, other: Self) -> Self { - self.eq(other).to_repr().to_u8() - } - - fn increment(self, incr: Self) -> Self { - // incr on -1 - self - incr - } - - fn sum(&self) -> usize { - let mut count = 0; - for i in 0..16 { - count += self.extract(i) as usize; - } - count - } -} - -#[cfg(feature = "avx-accel")] -impl ByteChunk for u8x32 { - type Splat = Self; - - fn splat(byte: u8) -> Self { - Self::splat(byte) - } - - fn from_splat(splat: Self) -> Self { - splat - } - - fn is_leading_utf8_byte(self) -> Self { - (self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8() - } - - fn bytewise_equal(self, other: Self) -> Self { - self.eq(other).to_repr().to_u8() - } - - fn 
increment(self, incr: Self) -> Self { - // incr on -1 - self - incr - } - - fn sum(&self) -> usize { - let zero = u8x16::splat(0); - let sad_lo = self.low().sad(zero); - let sad_hi = self.high().sad(zero); - - let mut count = 0; - count += (sad_lo.extract(0) + sad_lo.extract(1)) as usize; - count += (sad_hi.extract(0) + sad_hi.extract(1)) as usize; - count - } -} - - fn chunk_align(x: &[u8]) -> (&[u8], &[Chunk], &[u8]) { let align = mem::size_of::(); @@ -254,23 +176,26 @@ pub fn count(haystack: &[u8], needle: u8) -> usize { /// let number_of_spaces = bytecount::count(s, b' '); /// assert_eq!(number_of_spaces, 5); /// ``` -#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))] -pub fn count(haystack: &[u8], needle: u8) -> usize { - count_generic::(32, haystack, needle) -} - -/// Count occurrences of a byte in a slice of bytes, fast -/// -/// # Examples -/// -/// ``` -/// let s = b"This is a Text with spaces"; -/// let number_of_spaces = bytecount::count(s, b' '); -/// assert_eq!(number_of_spaces, 5); -/// ``` -#[cfg(feature = "avx-accel")] +#[cfg(feature = "simd-accel")] pub fn count(haystack: &[u8], needle: u8) -> usize { - count_generic::(64, haystack, needle) + if haystack.len() < 100 { + naive_count(haystack, needle) + } else { + let mut ret = 0; + + for i in 0..haystack.len() / (u8s::WIDTH * 255) { + ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255]) + .simd_iter() + .simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| { + acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01)) + }).scalar_reduce(0, |acc, s| acc + (s as usize)); + } + ret + (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..]) + .simd_iter() + .simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| { + acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01)) + }).scalar_reduce(0, |acc, s| acc + (s as usize)) + } } /// Count up to `(2^32)-1` occurrences of a byte in a slice @@ -333,7 +258,6 @@ fn num_chars_generic>(naive_below: 
usize, haysta count } - /// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast /// /// This function is safe to use on any byte array, valid UTF-8 or not, @@ -352,39 +276,29 @@ pub fn num_chars(haystack: &[u8]) -> usize { num_chars_generic::(32, haystack) } -/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast -/// -/// This function is safe to use on any byte array, valid UTF-8 or not, -/// but the output is only meaningful for well-formed UTF-8. -/// -/// # Example -/// -/// ``` -/// let swordfish = "メカジキ"; -/// let char_count = bytecount::num_chars(swordfish.as_bytes()); -/// assert_eq!(char_count, 4); -/// ``` -#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))] +/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast +#[cfg(feature = "simd-accel")] pub fn num_chars(haystack: &[u8]) -> usize { - num_chars_generic::(32, haystack) + if haystack.len() < 100 { + naive_num_chars(haystack) + } else { + let mut ret = 0; + + for i in 0..haystack.len() / (u8s::WIDTH * 255) { + ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255]) + .simd_iter() + .simd_reduce(u8s(0), u8s(0), |acc, v| { + acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01)) + }).scalar_reduce(0, |acc, s| acc + (s as usize)); + } + haystack.len() - ret - (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..]) + .simd_iter() + .simd_reduce(u8s(0), u8s(0), |acc, v| { + acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01)) + }).scalar_reduce(0, |acc, s| acc + (s as usize)) + } } -/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast -/// -/// This function is safe to use on any byte array, valid UTF-8 or not, -/// but the output is only meaningful for well-formed UTF-8. 
-/// -/// # Example -/// -/// ``` -/// let swordfish = "メカジキ"; -/// let char_count = bytecount::num_chars(swordfish.as_bytes()); -/// assert_eq!(char_count, 4); -/// ``` -#[cfg(feature = "avx-accel")] -pub fn num_chars(haystack: &[u8]) -> usize { - num_chars_generic::(64, haystack) -} /// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, simple ///