Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up count_hyper and num_chars_hyper #35

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,13 @@ matrix:
- rust: nightly
env:
- ARCH=x86_64
- FEATURES="--features avx-accel"
- RUSTFLAGS="-C target-feature=+avx"
- FEATURES="--features simd-accel"
- RUSTFLAGS="-C target-feature=+sse4.2"
- rust: nightly
env:
- ARCH=x86_64
- FEATURES="--features simd-accel"
- RUSTFLAGS="-C target-cpu=native"
addons:
apt:
packages:
Expand Down
5 changes: 2 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@ appveyor = { repository = "llogiq/bytecount" }
bench = false

[features]
avx-accel = ["simd-accel"]
simd-accel = ["simd"]
simd-accel = ["faster"]

[dependencies]
simd = { version = "0.2.0", optional = true }
faster = { version = "0.4.3", optional = true }

[dev-dependencies]
quickcheck = "0.6"
Expand Down
17 changes: 5 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,28 +28,21 @@ fn main() {
}
```

bytecount supports two features to make use of modern CPU's features to speed up counting considerably. To allow your
users to use them, add the following to your `Cargo.toml`:
bytecount makes use of features in modern CPUs to speed up counting considerably. To use these features,
add the following to your `Cargo.toml`:

```
[features]
avx-accel = ["bytecount/avx-accel"]
simd-accel = ["bytecount/simd-accel"]
```

Now your users can compile with SSE support (available on most modern x86_64 processors) using:
Now your users can compile with SIMD support, regardless of processor type, using:

```
cargo build --release --features simd-accel
RUSTFLAGS="-C target-cpu=native" cargo build --release --features simd-accel
```

Or even with AVX support (which likely requires compiling for the native target CPU):

```
RUSTFLAGS="-C target-cpu=native" cargo build --release --features "simd-accel avx-accel"
```

The algorithm is explained in depth
The scalar algorithm is explained in depth
[here](https://llogiq.github.io/2016/09/27/count.html).

Note that for very short slices, data parallelism is unlikely to yield much of a performance gain. In those cases, a naive
Expand Down
16 changes: 8 additions & 8 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ environment:
FEATURES: "--features simd-accel"
- TARGET: i686-pc-windows-gnu
CHANNEL: nightly
FEATURES: "--features avx-accel"
RUSTFLAGS: "-C target-feature=+avx"
FEATURES: "--features simd-accel"
RUSTFLAGS: "-C target-cpu=native"
- TARGET: i686-pc-windows-msvc
CHANNEL: stable
- TARGET: i686-pc-windows-msvc
Expand All @@ -28,8 +28,8 @@ environment:
FEATURES: "--features simd-accel"
- TARGET: i686-pc-windows-msvc
CHANNEL: nightly
FEATURES: "--features avx-accel"
RUSTFLAGS: "-C target-feature=+avx"
FEATURES: "--features simd-accel"
RUSTFLAGS: "-C target-cpu=native"
- TARGET: x86_64-pc-windows-gnu
CHANNEL: stable
- TARGET: x86_64-pc-windows-gnu
Expand All @@ -41,8 +41,8 @@ environment:
FEATURES: "--features simd-accel"
- TARGET: x86_64-pc-windows-gnu
CHANNEL: nightly
FEATURES: "--features avx-accel"
RUSTFLAGS: "-C target-feature=+avx"
FEATURES: "--features simd-accel"
RUSTFLAGS: "-C target-cpu=native"
- TARGET: x86_64-pc-windows-msvc
CHANNEL: stable
- TARGET: x86_64-pc-windows-msvc
Expand All @@ -54,8 +54,8 @@ environment:
FEATURES: "--features simd-accel"
- TARGET: x86_64-pc-windows-msvc
CHANNEL: nightly
FEATURES: "--features avx-accel"
RUSTFLAGS: "-C target-feature=+avx"
FEATURES: "--features simd-accel"
RUSTFLAGS: "-C target-cpu=native"

install:
- curl -sSf -o rustup-init.exe https://win.rustup.rs/
Expand Down
170 changes: 42 additions & 128 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,11 @@
#![deny(missing_docs)]

#[cfg(feature = "simd-accel")]
extern crate simd;

use core::{cmp, mem, slice, usize};

extern crate faster;
#[cfg(feature = "simd-accel")]
use simd::u8x16;
#[cfg(feature = "avx-accel")]
use simd::x86::sse2::Sse2U8x16;
#[cfg(feature = "avx-accel")]
use simd::x86::avx::{LowHigh128, u8x32};
use faster::*;

use core::{cmp, mem, slice, usize};

trait ByteChunk: Copy {
type Splat: Copy;
Expand Down Expand Up @@ -100,78 +94,6 @@ impl ByteChunk for usize {
}
}

// `ByteChunk` implementation for the 16-lane `u8x16` vector type; only compiled
// when the `simd-accel` feature is enabled.
#[cfg(feature = "simd-accel")]
impl ByteChunk for u8x16 {
// The splat representation is the vector itself, so no conversion is needed.
type Splat = Self;

// NOTE(review): `Self::splat` here presumably resolves to the vector type's own
// inherent `splat` constructor rather than recursing into this trait method -- confirm.
fn splat(byte: u8) -> Self {
Self::splat(byte)
}

// Identity: `Splat` and `Self` are the same type.
fn from_splat(splat: Self) -> Self {
splat
}

// Keeps only the top two bits of every lane and flags the lanes where they
// are NOT `10`, i.e. lanes that are not UTF-8 continuation bytes.
// NOTE(review): flagged lanes are presumably all-ones (0xFF) -- confirm with the `simd` docs.
fn is_leading_utf8_byte(self) -> Self {
(self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8()
}

// Lane-wise equality test, with the comparison result converted back to `u8` lanes.
fn bytewise_equal(self, other: Self) -> Self {
self.eq(other).to_repr().to_u8()
}

fn increment(self, incr: Self) -> Self {
// `incr` holds -1 (as u8) in the lanes to be counted, so subtracting it
// adds one to those lanes.
self - incr
}

// Adds up all 16 lanes, one `extract` at a time, into a `usize`.
fn sum(&self) -> usize {
let mut count = 0;
for i in 0..16 {
count += self.extract(i) as usize;
}
count
}
}

// `ByteChunk` implementation for the 32-lane `u8x32` vector type; only compiled
// when the `avx-accel` feature is enabled.
#[cfg(feature = "avx-accel")]
impl ByteChunk for u8x32 {
// The splat representation is the vector itself, so no conversion is needed.
type Splat = Self;

// NOTE(review): `Self::splat` here presumably resolves to the vector type's own
// inherent `splat` constructor rather than recursing into this trait method -- confirm.
fn splat(byte: u8) -> Self {
Self::splat(byte)
}

// Identity: `Splat` and `Self` are the same type.
fn from_splat(splat: Self) -> Self {
splat
}

// Keeps only the top two bits of every lane and flags the lanes where they
// are NOT `10`, i.e. lanes that are not UTF-8 continuation bytes.
// NOTE(review): flagged lanes are presumably all-ones (0xFF) -- confirm with the `simd` docs.
fn is_leading_utf8_byte(self) -> Self {
(self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8()
}

// Lane-wise equality test, with the comparison result converted back to `u8` lanes.
fn bytewise_equal(self, other: Self) -> Self {
self.eq(other).to_repr().to_u8()
}

fn increment(self, incr: Self) -> Self {
// `incr` holds -1 (as u8) in the lanes to be counted, so subtracting it
// adds one to those lanes.
self - incr
}

// Sums all lanes by handling the low and high 128-bit halves separately.
// NOTE(review): `sad` is presumably the SSE2 "sum of absolute differences"
// operation; against an all-zero vector it would yield partial sums of the
// bytes, which are then read from elements 0 and 1 -- confirm with the `simd` docs.
fn sum(&self) -> usize {
let zero = u8x16::splat(0);
let sad_lo = self.low().sad(zero);
let sad_hi = self.high().sad(zero);

let mut count = 0;
count += (sad_lo.extract(0) + sad_lo.extract(1)) as usize;
count += (sad_hi.extract(0) + sad_hi.extract(1)) as usize;
count
}
}


fn chunk_align<Chunk: ByteChunk>(x: &[u8]) -> (&[u8], &[Chunk], &[u8]) {
let align = mem::size_of::<Chunk>();

Expand Down Expand Up @@ -254,23 +176,26 @@ pub fn count(haystack: &[u8], needle: u8) -> usize {
/// let number_of_spaces = bytecount::count(s, b' ');
/// assert_eq!(number_of_spaces, 5);
/// ```
// Variant selected when `simd-accel` is enabled and `avx-accel` is not.
#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))]
pub fn count(haystack: &[u8], needle: u8) -> usize {
// Delegates to the generic counter using `u8x16` chunks.
// NOTE(review): 32 is presumably the length below which the naive loop is
// used -- confirm against `count_generic`.
count_generic::<u8x16>(32, haystack, needle)
}

/// Count occurrences of a byte in a slice of bytes, fast
///
/// # Examples
///
/// ```
/// let s = b"This is a Text with spaces";
/// let number_of_spaces = bytecount::count(s, b' ');
/// assert_eq!(number_of_spaces, 5);
/// ```
#[cfg(feature = "avx-accel")]
#[cfg(feature = "simd-accel")]
pub fn count(haystack: &[u8], needle: u8) -> usize {
count_generic::<u8x32>(64, haystack, needle)
// Slices shorter than 100 bytes go straight to the scalar `naive_count`.
if haystack.len() < 100 {
naive_count(haystack, needle)
} else {
// Running total of the matches found in the full-size chunks below.
let mut ret = 0;

// Walk the input in chunks of `u8s::WIDTH * 255` bytes. Every reduction
// step adds at most 1 to a `u8` lane of the accumulator.
// NOTE(review): assuming `simd_iter` yields one `u8s::WIDTH`-byte vector per
// step, a chunk is 255 steps, so a lane cannot exceed 255 -- confirm
// against the `faster` docs.
for i in 0..haystack.len() / (u8s::WIDTH * 255) {
ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255])
.simd_iter()
// NOTE(review): the second argument is presumably the value `faster` uses to
// fill a partial final vector; `needle + 1` (wrapping) never equals
// `needle`, so such filler would not be counted -- confirm.
.simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| {
// The comparison result is masked with 0x01 so a lane contributes at most 1.
acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01))
// Fold the per-lane counters of this chunk into a single `usize`.
}).scalar_reduce(0, |acc, s| acc + (s as usize));
}
// Process the bytes left over after the last full chunk the same way and
// add them to the total.
ret + (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..])
.simd_iter()
.simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| {
acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01))
}).scalar_reduce(0, |acc, s| acc + (s as usize))
}
}

/// Count up to `(2^32)-1` occurrences of a byte in a slice
Expand Down Expand Up @@ -333,7 +258,6 @@ fn num_chars_generic<Chunk: ByteChunk<Splat = Chunk>>(naive_below: usize, haysta
count
}


/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
///
/// This function is safe to use on any byte array, valid UTF-8 or not,
Expand All @@ -352,39 +276,29 @@ pub fn num_chars(haystack: &[u8]) -> usize {
num_chars_generic::<usize>(32, haystack)
}

/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
///
/// This function is safe to use on any byte array, valid UTF-8 or not,
/// but the output is only meaningful for well-formed UTF-8.
///
/// # Example
///
/// ```
/// let swordfish = "メカジキ";
/// let char_count = bytecount::num_chars(swordfish.as_bytes());
/// assert_eq!(char_count, 4);
/// ```
#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))]
/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
///
/// This function is safe to use on any byte array, valid UTF-8 or not,
/// but the output is only meaningful for well-formed UTF-8.
///
/// Slices shorter than 100 bytes are handed to `naive_num_chars`. Longer
/// slices are scanned with SIMD vectors: bytes of the form `0b10xx_xxxx`
/// (UTF-8 continuation bytes) are counted and that count is subtracted from
/// the slice length.
///
/// # Example
///
/// ```
/// let swordfish = "メカジキ";
/// let char_count = bytecount::num_chars(swordfish.as_bytes());
/// assert_eq!(char_count, 4);
/// ```
#[cfg(feature = "simd-accel")]
pub fn num_chars(haystack: &[u8]) -> usize {
num_chars_generic::<u8x16>(32, haystack)
// Short inputs use the scalar implementation.
if haystack.len() < 100 {
naive_num_chars(haystack)
} else {
// Number of continuation bytes found in the full-size chunks below.
let mut ret = 0;

// Walk the input in chunks of `u8s::WIDTH * 255` bytes. Every reduction
// step adds at most 1 to a `u8` lane of the accumulator.
// NOTE(review): assuming `simd_iter` yields one `u8s::WIDTH`-byte vector per
// step, a chunk is 255 steps, so a lane cannot exceed 255 -- confirm
// against the `faster` docs.
for i in 0..haystack.len() / (u8s::WIDTH * 255) {
ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255])
.simd_iter()
// NOTE(review): the second argument is presumably the filler for a partial
// final vector; 0 masked with 0xC0 is 0, which differs from 0x80, so
// filler would not be counted as a continuation byte -- confirm.
.simd_reduce(u8s(0), u8s(0), |acc, v| {
// A lane contributes 1 when its top two bits are `10`.
acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01))
// Fold the per-lane counters of this chunk into a single `usize`.
}).scalar_reduce(0, |acc, s| acc + (s as usize));
}
// Total bytes minus the continuation bytes from the full chunks minus the
// continuation bytes in the leftover tail.
haystack.len() - ret - (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..])
.simd_iter()
.simd_reduce(u8s(0), u8s(0), |acc, v| {
acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01))
}).scalar_reduce(0, |acc, s| acc + (s as usize))
}
}

/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
///
/// This function is safe to use on any byte array, valid UTF-8 or not,
/// but the output is only meaningful for well-formed UTF-8.
///
/// This variant is compiled only when the `avx-accel` feature is enabled.
///
/// # Example
///
/// ```
/// let swordfish = "メカジキ";
/// let char_count = bytecount::num_chars(swordfish.as_bytes());
/// assert_eq!(char_count, 4);
/// ```
#[cfg(feature = "avx-accel")]
pub fn num_chars(haystack: &[u8]) -> usize {
// Delegates to the generic implementation using `u8x32` chunks; 64 is
// passed as its `naive_below` argument.
num_chars_generic::<u8x32>(64, haystack)
}

/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, simple
///
Expand Down