From 032f0bb5e5d5c5d982252f3d986359772eaa3610 Mon Sep 17 00:00:00 2001
From: Adam Niederer <adam.niederer@gmail.com>
Date: Sat, 27 Jan 2018 15:57:16 -0500
Subject: [PATCH 1/6] Speed up count_hyper and num_chars_hyper

Using `faster` yields a serious speedup over `simd`, and removes the need for
feature segmentation.

faster kabylake
running 15 tests
test bench_count_30000_hyper           ... bench:   1,172 ns/iter (+/- 139)
test bench_count_big_0100000_hyper     ... bench:   3,892 ns/iter (+/- 9)
test bench_count_big_1000000_hyper     ... bench:  38,850 ns/iter (+/- 10,860)
test bench_num_chars_30000_hyper       ... bench:     805 ns/iter (+/- 133)
test bench_num_chars_big_0100000_hyper ... bench:   3,059 ns/iter (+/- 464)
test bench_num_chars_big_1000000_hyper ... bench:  34,522 ns/iter (+/- 2,631)

faster nehalem
test bench_count_30000_hyper           ... bench:   2,478 ns/iter (+/- 328)
test bench_count_big_0100000_hyper     ... bench:   8,079 ns/iter (+/- 3)
test bench_count_big_1000000_hyper     ... bench:  80,784 ns/iter (+/- 24,800)
test bench_num_chars_30000_hyper       ... bench:   1,380 ns/iter (+/- 10)
test bench_num_chars_big_0100000_hyper ... bench:   4,837 ns/iter (+/- 49)
test bench_num_chars_big_1000000_hyper ... bench:  50,925 ns/iter (+/- 7,802)

faster x86-64
test bench_count_30000_hyper           ... bench:   2,347 ns/iter (+/- 2)
test bench_count_big_0100000_hyper     ... bench:   8,325 ns/iter (+/- 2,010)
test bench_count_big_1000000_hyper     ... bench:  83,036 ns/iter (+/- 14,618)
test bench_num_chars_30000_hyper       ... bench:   1,474 ns/iter (+/- 350)
test bench_num_chars_big_0100000_hyper ... bench:   4,695 ns/iter (+/- 56)
test bench_num_chars_big_1000000_hyper ... bench:  51,214 ns/iter (+/- 22,773)

faster pentium
test bench_count_30000_hyper           ... bench:  50,619 ns/iter (+/- 27)
test bench_count_big_0100000_hyper     ... bench: 168,750 ns/iter (+/- 2,620)
test bench_count_big_1000000_hyper     ... bench: 1,793,224 ns/iter (+/- 184,043)
test bench_num_chars_30000_hyper       ... bench:   1,441 ns/iter (+/- 2)
test bench_num_chars_big_0100000_hyper ... bench:   5,045 ns/iter (+/- 79)
test bench_num_chars_big_1000000_hyper ... bench:  52,989 ns/iter (+/- 4,530)

simd kabylake
test bench_count_30000_hyper           ... bench:   1,658 ns/iter (+/- 71)
test bench_count_big_0100000_hyper     ... bench:   5,999 ns/iter (+/- 11)
test bench_count_big_1000000_hyper     ... bench:  61,536 ns/iter (+/- 2,047)
test bench_num_chars_30000_hyper       ... bench:   5,506 ns/iter (+/- 176)
test bench_num_chars_big_0100000_hyper ... bench:  19,045 ns/iter (+/- 2,317)
test bench_num_chars_big_1000000_hyper ... bench: 190,179 ns/iter (+/- 6,082)

simd nehalem
test bench_count_30000_hyper           ... bench:   2,011 ns/iter (+/- 33)
test bench_count_big_0100000_hyper     ... bench:   7,728 ns/iter (+/- 307)
test bench_count_big_1000000_hyper     ... bench:  77,853 ns/iter (+/- 5,531)
test bench_num_chars_30000_hyper       ... bench:     988 ns/iter (+/- 7)
test bench_num_chars_big_0100000_hyper ... bench:   4,137 ns/iter (+/- 528)
test bench_num_chars_big_1000000_hyper ... bench:  45,211 ns/iter (+/- 6,833)

simd x86-64
test bench_count_30000_hyper           ... bench:   2,286 ns/iter (+/- 313)
test bench_count_big_0100000_hyper     ... bench:   7,610 ns/iter (+/- 898)
test bench_count_big_1000000_hyper     ... bench:  79,711 ns/iter (+/- 5,352)
test bench_num_chars_30000_hyper       ... bench:     987 ns/iter (+/- 30)
test bench_num_chars_big_0100000_hyper ... bench:   3,985 ns/iter (+/- 248)
test bench_num_chars_big_1000000_hyper ... bench:  43,328 ns/iter (+/- 1,382)

simd pentium
error[E0432]: unresolved import `x86::sse2`
  --> /home/adam/.cargo/registry/src/github.com-1ecc6299db9ec823/simd-0.2.1/src/common.rs:16:10
   |
16 | use x86::sse2::common;
   |          ^^^^ Could not find `sse2` in `x86`

error: aborting due to previous error

error: Could not compile `simd`.
warning: build failed, waiting for other jobs to finish...
error: build failed
---
 Cargo.toml |   5 +-
 src/lib.rs | 149 +++++++----------------------------------------------
 2 files changed, 22 insertions(+), 132 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index ef72107..678f1a9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,11 +15,10 @@ appveyor = { repository = "llogiq/bytecount" }
 bench = false
 
 [features]
-avx-accel = ["simd-accel"]
-simd-accel = ["simd"]
+simd-accel = ["faster"]
 
 [dependencies]
-simd = { version = "0.2.0", optional = true }
+faster = { version = "0.4.2", optional=true }
 
 [dev-dependencies]
 quickcheck = "0.6"
diff --git a/src/lib.rs b/src/lib.rs
index 8a59486..420eb9d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -37,17 +37,11 @@
 #![deny(missing_docs)]
 
 #[cfg(feature = "simd-accel")]
-extern crate simd;
-
-use core::{cmp, mem, slice, usize};
-
+extern crate faster;
 #[cfg(feature = "simd-accel")]
-use simd::u8x16;
-#[cfg(feature = "avx-accel")]
-use simd::x86::sse2::Sse2U8x16;
-#[cfg(feature = "avx-accel")]
-use simd::x86::avx::{LowHigh128, u8x32};
+use faster::*;
 
+use core::{cmp, mem, ops, slice, usize};
 
 trait ByteChunk: Copy {
     type Splat: Copy;
@@ -100,78 +94,6 @@ impl ByteChunk for usize {
     }
 }
 
-#[cfg(feature = "simd-accel")]
-impl ByteChunk for u8x16 {
-    type Splat = Self;
-
-    fn splat(byte: u8) -> Self {
-        Self::splat(byte)
-    }
-
-    fn from_splat(splat: Self) -> Self {
-        splat
-    }
-
-    fn is_leading_utf8_byte(self) -> Self {
-        (self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8()
-    }
-
-    fn bytewise_equal(self, other: Self) -> Self {
-        self.eq(other).to_repr().to_u8()
-    }
-
-    fn increment(self, incr: Self) -> Self {
-        // incr on -1
-        self - incr
-    }
-
-    fn sum(&self) -> usize {
-        let mut count = 0;
-        for i in 0..16 {
-            count += self.extract(i) as usize;
-        }
-        count
-    }
-}
-
-#[cfg(feature = "avx-accel")]
-impl ByteChunk for u8x32 {
-    type Splat = Self;
-
-    fn splat(byte: u8) -> Self {
-        Self::splat(byte)
-    }
-
-    fn from_splat(splat: Self) -> Self {
-        splat
-    }
-
-    fn is_leading_utf8_byte(self) -> Self {
-        (self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8()
-    }
-
-    fn bytewise_equal(self, other: Self) -> Self {
-        self.eq(other).to_repr().to_u8()
-    }
-
-    fn increment(self, incr: Self) -> Self {
-        // incr on -1
-        self - incr
-    }
-
-    fn sum(&self) -> usize {
-        let zero = u8x16::splat(0);
-        let sad_lo = self.low().sad(zero);
-        let sad_hi = self.high().sad(zero);
-
-        let mut count = 0;
-        count += (sad_lo.extract(0) + sad_lo.extract(1)) as usize;
-        count += (sad_hi.extract(0) + sad_hi.extract(1)) as usize;
-        count
-    }
-}
-
-
 fn chunk_align<Chunk: ByteChunk>(x: &[u8]) -> (&[u8], &[Chunk], &[u8]) {
     let align = mem::size_of::<Chunk>();
 
@@ -254,23 +176,21 @@ pub fn count(haystack: &[u8], needle: u8) -> usize {
 /// let number_of_spaces = bytecount::count(s, b' ');
 /// assert_eq!(number_of_spaces, 5);
 /// ```
-#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))]
-pub fn count(haystack: &[u8], needle: u8) -> usize {
-    count_generic::<u8x16>(32, haystack, needle)
-}
-
-/// Count occurrences of a byte in a slice of bytes, fast
-///
-/// # Examples
-///
-/// ```
-/// let s = b"This is a Text with spaces";
-/// let number_of_spaces = bytecount::count(s, b' ');
-/// assert_eq!(number_of_spaces, 5);
-/// ```
-#[cfg(feature = "avx-accel")]
+#[cfg(feature = "simd-accel")]
 pub fn count(haystack: &[u8], needle: u8) -> usize {
-    count_generic::<u8x32>(64, haystack, needle)
+    let mut ret: usize = 0;
+    let mut i = 0;
+    let mut acc = u8s(0);
+    haystack.simd_iter().simd_for_each(u8s(needle.overflowing_add(1).0), |v| {
+        i += 1;
+        acc += (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01));
+        if i == 255 {
+            ret += acc.scalar_reduce(0, |acc, s| acc + (s as usize));
+            acc = u8s(0);
+            i = 0;
+        }
+    });
+    ret + acc.scalar_reduce(0, |acc, s| acc + (s as usize))
 }
 
 /// Count up to `(2^32)-1` occurrences of a byte in a slice
@@ -333,7 +253,6 @@ fn num_chars_generic<Chunk: ByteChunk<Splat = Chunk>>(naive_below: usize, haysta
     count
 }
 
-
 /// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
 ///
 /// This function is safe to use on any byte array, valid UTF-8 or not,
@@ -346,46 +265,18 @@ fn num_chars_generic<Chunk: ByteChunk<Splat = Chunk>>(naive_below: usize, haysta
 /// let char_count = bytecount::num_chars(swordfish.as_bytes());
 /// assert_eq!(char_count, 4);
 /// ```
-#[cfg(not(feature = "simd-accel"))]
 pub fn num_chars(haystack: &[u8]) -> usize {
     // Never use [usize; 4]
     num_chars_generic::<usize>(32, haystack)
 }
 
-/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
-///
-/// This function is safe to use on any byte array, valid UTF-8 or not,
-/// but the output is only meaningful for well-formed UTF-8.
-///
-/// # Example
-///
-/// ```
-/// let swordfish = "メカジキ";
-/// let char_count = bytecount::num_chars(swordfish.as_bytes());
-/// assert_eq!(char_count, 4);
-/// ```
-#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))]
+#[cfg(feature = "simd-accel")]
 pub fn num_chars(haystack: &[u8]) -> usize {
-    num_chars_generic::<u8x16>(32, haystack)
-}
 
-/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
-///
-/// This function is safe to use on any byte array, valid UTF-8 or not,
-/// but the output is only meaningful for well-formed UTF-8.
-///
-/// # Example
-///
-/// ```
-/// let swordfish = "メカジキ";
-/// let char_count = bytecount::num_chars(swordfish.as_bytes());
-/// assert_eq!(char_count, 4);
-/// ```
-#[cfg(feature = "avx-accel")]
-pub fn num_chars(haystack: &[u8]) -> usize {
-    num_chars_generic::<u8x32>(64, haystack)
+
 }
 
+
 /// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, simple
 ///
 /// This function is safe to use on any byte array, valid UTF-8 or not,

From ecd5edffc62bff95ea1d226343b2cd42c13b985a Mon Sep 17 00:00:00 2001
From: Adam Niederer <adam.niederer@gmail.com>
Date: Sat, 27 Jan 2018 16:44:32 -0500
Subject: [PATCH 2/6] Use naive method for small slices

Even without `faster`, the "hyper" method of counting is slower than the naive
method for small slices.
---
 Cargo.toml |  2 +-
 src/lib.rs | 30 +++++++++++++++++-------------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 678f1a9..1b29a72 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ bench = false
 simd-accel = ["faster"]
 
 [dependencies]
-faster = { version = "0.4.2", optional=true }
+faster = { version = "0.4.2", optional = true }
 
 [dev-dependencies]
 quickcheck = "0.6"
diff --git a/src/lib.rs b/src/lib.rs
index 420eb9d..03a73a0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -178,19 +178,23 @@ pub fn count(haystack: &[u8], needle: u8) -> usize {
 /// ```
 #[cfg(feature = "simd-accel")]
 pub fn count(haystack: &[u8], needle: u8) -> usize {
-    let mut ret: usize = 0;
-    let mut i = 0;
-    let mut acc = u8s(0);
-    haystack.simd_iter().simd_for_each(u8s(needle.overflowing_add(1).0), |v| {
-        i += 1;
-        acc += (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01));
-        if i == 255 {
-            ret += acc.scalar_reduce(0, |acc, s| acc + (s as usize));
-            acc = u8s(0);
-            i = 0;
-        }
-    });
-    ret + acc.scalar_reduce(0, |acc, s| acc + (s as usize))
+    if haystack.len() < 100 {
+        naive_count(haystack, needle)
+    } else {
+        let mut ret = 0;
+        let mut i = 0;
+        let mut acc = u8s(0);
+        haystack.simd_iter().simd_for_each(u8s(needle.overflowing_add(1).0), |v| {
+            i += 1;
+            acc += (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01));
+            if i == 255 {
+                ret += acc.scalar_reduce(0, |acc, s| acc + (s as usize));
+                acc = u8s(0);
+                i = 0;
+            }
+        });
+        ret + acc.scalar_reduce(0, |acc, s| acc + (s as usize))
+    }
 }
 
 /// Count up to `(2^32)-1` occurrences of a byte in a slice

From 1fc04c6992acaf3e75a7acf8f00b5a04c6795253 Mon Sep 17 00:00:00 2001
From: Adam Niederer <adam.niederer@gmail.com>
Date: Sun, 28 Jan 2018 15:53:53 -0500
Subject: [PATCH 3/6] Update faster, README, tests

---
 .travis.yml |  9 +++++++--
 Cargo.toml  |  2 +-
 README.md   | 17 +++++------------
 3 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 0f166dc..54046f1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,8 +30,13 @@ matrix:
     - rust: nightly
       env:
         - ARCH=x86_64
-        - FEATURES="--features avx-accel"
-        - RUSTFLAGS="-C target-feature=+avx"
+        - FEATURES="--features simd-accel"
+        - RUSTFLAGS="-C target-feature=+sse4.2"
+    - rust: nightly
+      env:
+        - ARCH=x86_64
+        - FEATURES="--features simd-accel"
+        - RUSTFLAGS="-C target-cpu=native"
 addons:
   apt:
     packages:
diff --git a/Cargo.toml b/Cargo.toml
index 1b29a72..a921ee6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ bench = false
 simd-accel = ["faster"]
 
 [dependencies]
-faster = { version = "0.4.2", optional = true }
+faster = { version = "0.4.3", optional = true }
 
 [dev-dependencies]
 quickcheck = "0.6"
diff --git a/README.md b/README.md
index a5b09ef..c94fee6 100644
--- a/README.md
+++ b/README.md
@@ -28,28 +28,21 @@ fn main() {
 }
 ```
 
-bytecount supports two features to make use of modern CPU's features to speed up counting considerably. To allow your
-users to use them, add the following to your `Cargo.toml`:
+bytecount makes use of features in modern CPUs to speed up counting considerably. To use these features,
+add the following to your `Cargo.toml`:
 
 ```
 [features]
-avx-accel = ["bytecount/avx-accel"]
 simd-accel = ["bytecount/simd-accel"]
 ```
 
-Now your users can compile with SSE support (available on most modern x86_64 processors) using:
+Now your users can compile with SIMD support, regardless of processor type, using:
 
 ```
-cargo build --release --features simd-accel
+RUSTFLAGS="-C target-cpu=native" cargo build --release --features simd-accel
 ```
 
-Or even with AVX support (which likely requires compiling for the native target CPU):
-
-```
-RUSTFLAGS="-C target-cpu=native" cargo build --release --features "simd-accel avx-accel"
-```
-
-The algorithm is explained in depth
+The scalar algorithm is explained in depth
 [here](https://llogiq.github.io/2016/09/27/count.html).
 
 Note that for very short slices, the data parallelism will likely not win much performance gains. In those cases, a naive

From e807753b9f4e16eec5b2c4845eb5373f1e286292 Mon Sep 17 00:00:00 2001
From: Adam Niederer <adam.niederer@gmail.com>
Date: Tue, 30 Jan 2018 15:56:14 -0500
Subject: [PATCH 4/6] Update core SIMD algorithm

Takes us from ~30,000 ns to ~18,900 ns
---
 src/lib.rs | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 03a73a0..11363ed 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -182,18 +182,19 @@ pub fn count(haystack: &[u8], needle: u8) -> usize {
         naive_count(haystack, needle)
     } else {
         let mut ret = 0;
-        let mut i = 0;
-        let mut acc = u8s(0);
-        haystack.simd_iter().simd_for_each(u8s(needle.overflowing_add(1).0), |v| {
-            i += 1;
-            acc += (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01));
-            if i == 255 {
-                ret += acc.scalar_reduce(0, |acc, s| acc + (s as usize));
-                acc = u8s(0);
-                i = 0;
-            }
-        });
-        ret + acc.scalar_reduce(0, |acc, s| acc + (s as usize))
+
+        for i in 0..haystack.len() / (u8s::WIDTH * 255) {
+            ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255])
+                .simd_iter()
+                .simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| {
+                    acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01))
+                }).scalar_reduce(0, |acc, s| acc + (s as usize));
+        }
+        ret + (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..])
+            .simd_iter()
+            .simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| {
+                acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01))
+            }).scalar_reduce(0, |acc, s| acc + (s as usize))
     }
 }
 

From bcd1902a6557a2d82942571c34e448acb849d441 Mon Sep 17 00:00:00 2001
From: Adam Niederer <adam.niederer@gmail.com>
Date: Tue, 30 Jan 2018 16:51:10 -0500
Subject: [PATCH 5/6] Fix appveyor?

---
 appveyor.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 5f861bd..27056cc 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -15,8 +15,8 @@ environment:
           FEATURES: "--features simd-accel"
         - TARGET: i686-pc-windows-gnu
           CHANNEL: nightly
-          FEATURES: "--features avx-accel"
-          RUSTFLAGS: "-C target-feature=+avx"
+          FEATURES: "--features simd-accel"
+          RUSTFLAGS: "-C target-cpu=native"
         - TARGET: i686-pc-windows-msvc
           CHANNEL: stable
         - TARGET: i686-pc-windows-msvc
@@ -28,8 +28,8 @@ environment:
           FEATURES: "--features simd-accel"
         - TARGET: i686-pc-windows-msvc
           CHANNEL: nightly
-          FEATURES: "--features avx-accel"
-          RUSTFLAGS: "-C target-feature=+avx"
+          FEATURES: "--features simd-accel"
+          RUSTFLAGS: "-C target-cpu=native"
         - TARGET: x86_64-pc-windows-gnu
           CHANNEL: stable
         - TARGET: x86_64-pc-windows-gnu
@@ -41,8 +41,8 @@ environment:
           FEATURES: "--features simd-accel"
         - TARGET: x86_64-pc-windows-gnu
           CHANNEL: nightly
-          FEATURES: "--features avx-accel"
-          RUSTFLAGS: "-C target-feature=+avx"
+          FEATURES: "--features simd-accel"
+          RUSTFLAGS: "-C target-cpu=native"
         - TARGET: x86_64-pc-windows-msvc
           CHANNEL: stable
         - TARGET: x86_64-pc-windows-msvc
@@ -54,8 +54,8 @@ environment:
           FEATURES: "--features simd-accel"
         - TARGET: x86_64-pc-windows-msvc
           CHANNEL: nightly
-          FEATURES: "--features avx-accel"
-          RUSTFLAGS: "-C target-feature=+avx"
+          FEATURES: "--features simd-accel"
+          RUSTFLAGS: "-C target-cpu=native"
 
 install:
     - curl -sSf -o rustup-init.exe https://win.rustup.rs/

From ccaa4583961490d9f9558c1389462d383a7c6013 Mon Sep 17 00:00:00 2001
From: Adam Niederer <adam.niederer@gmail.com>
Date: Tue, 30 Jan 2018 17:21:05 -0500
Subject: [PATCH 6/6] Add faster-accelerated num_chars

---
 src/lib.rs | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 11363ed..62b3bfb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -41,7 +41,7 @@ extern crate faster;
 #[cfg(feature = "simd-accel")]
 use faster::*;
 
-use core::{cmp, mem, ops, slice, usize};
+use core::{cmp, mem, slice, usize};
 
 trait ByteChunk: Copy {
     type Splat: Copy;
@@ -270,15 +270,33 @@ fn num_chars_generic<Chunk: ByteChunk<Splat = Chunk>>(naive_below: usize, haysta
 /// let char_count = bytecount::num_chars(swordfish.as_bytes());
 /// assert_eq!(char_count, 4);
 /// ```
+#[cfg(not(feature = "simd-accel"))]
 pub fn num_chars(haystack: &[u8]) -> usize {
     // Never use [usize; 4]
     num_chars_generic::<usize>(32, haystack)
 }
 
+/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
 #[cfg(feature = "simd-accel")]
 pub fn num_chars(haystack: &[u8]) -> usize {
+    if haystack.len() < 100 {
+        naive_num_chars(haystack)
+    } else {
+        let mut ret = 0;
 
-
+        for i in 0..haystack.len() / (u8s::WIDTH * 255) {
+            ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255])
+                .simd_iter()
+                .simd_reduce(u8s(0), u8s(0), |acc, v| {
+                    acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01))
+                }).scalar_reduce(0, |acc, s| acc + (s as usize));
+        }
+        haystack.len() - ret - (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..])
+            .simd_iter()
+            .simd_reduce(u8s(0), u8s(0), |acc, v| {
+                acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01))
+            }).scalar_reduce(0, |acc, s| acc + (s as usize))
+    }
 }