diff --git a/.travis.yml b/.travis.yml
index 0f166dc..54046f1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,8 +30,13 @@ matrix:
     - rust: nightly
       env:
         - ARCH=x86_64
-        - FEATURES="--features avx-accel"
-        - RUSTFLAGS="-C target-feature=+avx"
+        - FEATURES="--features simd-accel"
+        - RUSTFLAGS="-C target-feature=+sse4.2"
+    - rust: nightly
+      env:
+        - ARCH=x86_64
+        - FEATURES="--features simd-accel"
+        - RUSTFLAGS="-C target-cpu=native"
 addons:
   apt:
     packages:
diff --git a/Cargo.toml b/Cargo.toml
index ef72107..a921ee6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,11 +15,10 @@ appveyor = { repository = "llogiq/bytecount" }
 bench = false
 
 [features]
-avx-accel = ["simd-accel"]
-simd-accel = ["simd"]
+simd-accel = ["faster"]
 
 [dependencies]
-simd = { version = "0.2.0", optional = true }
+faster = { version = "0.4.3", optional = true }
 
 [dev-dependencies]
 quickcheck = "0.6"
diff --git a/README.md b/README.md
index a5b09ef..c94fee6 100644
--- a/README.md
+++ b/README.md
@@ -28,28 +28,21 @@ fn main() {
 }
 ```
 
-bytecount supports two features to make use of modern CPU's features to speed up counting considerably. To allow your
-users to use them, add the following to your `Cargo.toml`:
+bytecount makes use of features in modern CPUs to speed up counting considerably. To use these features,
+add the following to your `Cargo.toml`:
 
 ```
 [features]
-avx-accel = ["bytecount/avx-accel"]
 simd-accel = ["bytecount/simd-accel"]
 ```
 
-Now your users can compile with SSE support (available on most modern x86_64 processors) using:
+Now your users can compile with SIMD support, regardless of processor type (`target-cpu=native` targets the CPU of the build machine), using:
 
 ```
-cargo build --release --features simd-accel
+RUSTFLAGS="-C target-cpu=native" cargo build --release --features simd-accel
 ```
 
-Or even with AVX support (which likely requires compiling for the native target CPU):
-
-```
-RUSTFLAGS="-C target-cpu=native" cargo build --release --features "simd-accel avx-accel"
-```
-
-The algorithm is explained in depth
+The scalar algorithm is explained in depth
 [here](https://llogiq.github.io/2016/09/27/count.html).
 
 Note that for very short slices, the data parallelism will likely not win much performance gains. In those cases, a naive
diff --git a/appveyor.yml b/appveyor.yml
index 5f861bd..27056cc 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -15,8 +15,8 @@ environment:
           FEATURES: "--features simd-accel"
         - TARGET: i686-pc-windows-gnu
           CHANNEL: nightly
-          FEATURES: "--features avx-accel"
-          RUSTFLAGS: "-C target-feature=+avx"
+          FEATURES: "--features simd-accel"
+          RUSTFLAGS: "-C target-cpu=native"
         - TARGET: i686-pc-windows-msvc
           CHANNEL: stable
         - TARGET: i686-pc-windows-msvc
@@ -28,8 +28,8 @@ environment:
           FEATURES: "--features simd-accel"
         - TARGET: i686-pc-windows-msvc
           CHANNEL: nightly
-          FEATURES: "--features avx-accel"
-          RUSTFLAGS: "-C target-feature=+avx"
+          FEATURES: "--features simd-accel"
+          RUSTFLAGS: "-C target-cpu=native"
         - TARGET: x86_64-pc-windows-gnu
           CHANNEL: stable
         - TARGET: x86_64-pc-windows-gnu
@@ -41,8 +41,8 @@ environment:
           FEATURES: "--features simd-accel"
         - TARGET: x86_64-pc-windows-gnu
           CHANNEL: nightly
-          FEATURES: "--features avx-accel"
-          RUSTFLAGS: "-C target-feature=+avx"
+          FEATURES: "--features simd-accel"
+          RUSTFLAGS: "-C target-cpu=native"
         - TARGET: x86_64-pc-windows-msvc
           CHANNEL: stable
         - TARGET: x86_64-pc-windows-msvc
@@ -54,8 +54,8 @@ environment:
           FEATURES: "--features simd-accel"
         - TARGET: x86_64-pc-windows-msvc
           CHANNEL: nightly
-          FEATURES: "--features avx-accel"
-          RUSTFLAGS: "-C target-feature=+avx"
+          FEATURES: "--features simd-accel"
+          RUSTFLAGS: "-C target-cpu=native"
 
 install:
     - curl -sSf -o rustup-init.exe https://win.rustup.rs/
diff --git a/src/lib.rs b/src/lib.rs
index 8a59486..62b3bfb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -37,17 +37,11 @@
 #![deny(missing_docs)]
 
 #[cfg(feature = "simd-accel")]
-extern crate simd;
-
-use core::{cmp, mem, slice, usize};
-
+extern crate faster;
 #[cfg(feature = "simd-accel")]
-use simd::u8x16;
-#[cfg(feature = "avx-accel")]
-use simd::x86::sse2::Sse2U8x16;
-#[cfg(feature = "avx-accel")]
-use simd::x86::avx::{LowHigh128, u8x32};
+use faster::*;
 
+use core::{cmp, mem, slice, usize};
 
 trait ByteChunk: Copy {
     type Splat: Copy;
@@ -100,78 +94,6 @@ impl ByteChunk for usize {
     }
 }
 
-#[cfg(feature = "simd-accel")]
-impl ByteChunk for u8x16 {
-    type Splat = Self;
-
-    fn splat(byte: u8) -> Self {
-        Self::splat(byte)
-    }
-
-    fn from_splat(splat: Self) -> Self {
-        splat
-    }
-
-    fn is_leading_utf8_byte(self) -> Self {
-        (self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8()
-    }
-
-    fn bytewise_equal(self, other: Self) -> Self {
-        self.eq(other).to_repr().to_u8()
-    }
-
-    fn increment(self, incr: Self) -> Self {
-        // incr on -1
-        self - incr
-    }
-
-    fn sum(&self) -> usize {
-        let mut count = 0;
-        for i in 0..16 {
-            count += self.extract(i) as usize;
-        }
-        count
-    }
-}
-
-#[cfg(feature = "avx-accel")]
-impl ByteChunk for u8x32 {
-    type Splat = Self;
-
-    fn splat(byte: u8) -> Self {
-        Self::splat(byte)
-    }
-
-    fn from_splat(splat: Self) -> Self {
-        splat
-    }
-
-    fn is_leading_utf8_byte(self) -> Self {
-        (self & Self::splat(0b1100_0000)).ne(Self::splat(0b1000_0000)).to_repr().to_u8()
-    }
-
-    fn bytewise_equal(self, other: Self) -> Self {
-        self.eq(other).to_repr().to_u8()
-    }
-
-    fn increment(self, incr: Self) -> Self {
-        // incr on -1
-        self - incr
-    }
-
-    fn sum(&self) -> usize {
-        let zero = u8x16::splat(0);
-        let sad_lo = self.low().sad(zero);
-        let sad_hi = self.high().sad(zero);
-
-        let mut count = 0;
-        count += (sad_lo.extract(0) + sad_lo.extract(1)) as usize;
-        count += (sad_hi.extract(0) + sad_hi.extract(1)) as usize;
-        count
-    }
-}
-
-
 fn chunk_align<Chunk: ByteChunk>(x: &[u8]) -> (&[u8], &[Chunk], &[u8]) {
     let align = mem::size_of::<Chunk>();
 
@@ -254,23 +176,26 @@ pub fn count(haystack: &[u8], needle: u8) -> usize {
 /// let number_of_spaces = bytecount::count(s, b' ');
 /// assert_eq!(number_of_spaces, 5);
 /// ```
-#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))]
-pub fn count(haystack: &[u8], needle: u8) -> usize {
-    count_generic::<u8x16>(32, haystack, needle)
-}
-
-/// Count occurrences of a byte in a slice of bytes, fast
-///
-/// # Examples
-///
-/// ```
-/// let s = b"This is a Text with spaces";
-/// let number_of_spaces = bytecount::count(s, b' ');
-/// assert_eq!(number_of_spaces, 5);
-/// ```
-#[cfg(feature = "avx-accel")]
+#[cfg(feature = "simd-accel")]
 pub fn count(haystack: &[u8], needle: u8) -> usize {
-    count_generic::<u8x32>(64, haystack, needle)
+    if haystack.len() < 100 {
+        naive_count(haystack, needle)
+    } else {
+        let mut ret = 0;
+
+        for i in 0..haystack.len() / (u8s::WIDTH * 255) {
+            ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255])
+                .simd_iter()
+                .simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| {
+                    acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01))
+                }).scalar_reduce(0, |acc, s| acc + (s as usize));
+        }
+        ret + (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..])
+            .simd_iter()
+            .simd_reduce(u8s(0), u8s(needle.overflowing_add(1).0), |acc, v| {
+                acc + (PackedEq::eq(&v, &u8s(needle)).be_u8s() & u8s(0x01))
+            }).scalar_reduce(0, |acc, s| acc + (s as usize))
+    }
 }
 
 /// Count up to `(2^32)-1` occurrences of a byte in a slice
@@ -333,7 +258,6 @@ fn num_chars_generic<Chunk: ByteChunk<Splat = Chunk>>(naive_below: usize, haysta
     count
 }
 
-
 /// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
 ///
 /// This function is safe to use on any byte array, valid UTF-8 or not,
@@ -352,39 +276,29 @@ pub fn num_chars(haystack: &[u8]) -> usize {
     num_chars_generic::<usize>(32, haystack)
 }
 
-/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
-///
-/// This function is safe to use on any byte array, valid UTF-8 or not,
-/// but the output is only meaningful for well-formed UTF-8.
-///
-/// # Example
-///
-/// ```
-/// let swordfish = "メカジキ";
-/// let char_count = bytecount::num_chars(swordfish.as_bytes());
-/// assert_eq!(char_count, 4);
-/// ```
-#[cfg(all(feature = "simd-accel", not(feature = "avx-accel")))]
+/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
+#[cfg(feature = "simd-accel")]
 pub fn num_chars(haystack: &[u8]) -> usize {
-    num_chars_generic::<u8x16>(32, haystack)
+    if haystack.len() < 100 {
+        naive_num_chars(haystack)
+    } else {
+        let mut ret = 0;
+
+        for i in 0..haystack.len() / (u8s::WIDTH * 255) {
+            ret += (&haystack[i * u8s::WIDTH * 255..(i + 1) * u8s::WIDTH * 255])
+                .simd_iter()
+                .simd_reduce(u8s(0), u8s(0), |acc, v| {
+                    acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01))
+                }).scalar_reduce(0, |acc, s| acc + (s as usize));
+        }
+        haystack.len() - ret - (&haystack[haystack.len() - haystack.len() % (u8s::WIDTH * 255)..])
+            .simd_iter()
+            .simd_reduce(u8s(0), u8s(0), |acc, v| {
+                acc + (PackedEq::eq(&(v & u8s(0xC0)), &u8s(0x80)).be_u8s() & u8s(0x01))
+            }).scalar_reduce(0, |acc, s| acc + (s as usize))
+    }
 }
 
-/// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, fast
-///
-/// This function is safe to use on any byte array, valid UTF-8 or not,
-/// but the output is only meaningful for well-formed UTF-8.
-///
-/// # Example
-///
-/// ```
-/// let swordfish = "メカジキ";
-/// let char_count = bytecount::num_chars(swordfish.as_bytes());
-/// assert_eq!(char_count, 4);
-/// ```
-#[cfg(feature = "avx-accel")]
-pub fn num_chars(haystack: &[u8]) -> usize {
-    num_chars_generic::<u8x32>(64, haystack)
-}
 
 /// Count the number of UTF-8 encoded unicode codepoints in a slice of bytes, simple
 ///