diff --git a/.travis.yml b/.travis.yml
index 45292f0bd2..a0ebd95ae4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -55,6 +55,7 @@ matrix:
         cargo clippy --all -- -D clippy-pedantic
   allow_failures:
     - env: CLIPPY=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
+    - env: TARGET=wasm32-unknown-unknown
 
 before_install:
   # FIXME (travis-ci/travis-ci#8920) shouldn't be necessary...
diff --git a/ci/run.sh b/ci/run.sh
index 3ea3ae08b5..5ca8bbdd96 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -35,3 +35,14 @@ cargo_test() {
 
 cargo_test
 cargo_test "--release"
+
+# Test x86 targets compiled with AVX.
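+# (Illustrative: locally this roughly corresponds to running, e.g.,
+# `TARGET=x86_64-unknown-linux-gnu ci/run.sh`, which repeats the release tests
+# with `-C target-feature=+avx` and with instruction assertions disabled.)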
+case ${TARGET} in
+    x86*)
+        RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
+        export STDSIMD_DISABLE_ASSERT_INSTR=1
+        cargo_test "--release"
+        ;;
+    *)
+        ;;
+esac
diff --git a/coresimd/macros.rs b/coresimd/macros.rs
index 343f425c1a..fa96f50c81 100644
--- a/coresimd/macros.rs
+++ b/coresimd/macros.rs
@@ -13,3 +13,44 @@ macro_rules! types {
         pub struct $name($($fields)*);
     )*)
 }
+
+macro_rules! cfg_if {
+    ($(
+        if #[cfg($($meta:meta),*)] { $($it:item)* }
+    ) else * else {
+        $($it2:item)*
+    }) => {
+        __cfg_if_items! {
+            () ;
+            $( ( ($($meta),*) ($($it)*) ), )*
+            ( () ($($it2)*) ),
+        }
+    };
+    (
+        if #[cfg($($i_met:meta),*)] { $($i_it:item)* }
+        $(
+            else if #[cfg($($e_met:meta),*)] { $($e_it:item)* }
+        )*
+    ) => {
+        __cfg_if_items! {
+            () ;
+            ( ($($i_met),*) ($($i_it)*) ),
+            $( ( ($($e_met),*) ($($e_it)*) ), )*
+            ( () () ),
+        }
+    }
+}
+
+macro_rules! __cfg_if_items {
+    (($($not:meta,)*) ; ) => {};
+    (($($not:meta,)*) ; ( ($($m:meta),*) ($($it:item)*) ), $($rest:tt)*) => {
+        __cfg_if_apply! { cfg(all($($m,)* not(any($($not),*)))), $($it)* }
+        __cfg_if_items! { ($($not,)* $($m,)*) ; $($rest)* }
+    }
+}
+
+macro_rules! __cfg_if_apply {
+    ($m:meta, $($it:item)*) => {
+        $(#[$m] $it)*
+    }
+}
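+
+// Illustrative expansion (not used anywhere): `cfg_if!` emits at most one of
+// the item groups, the first whose `cfg` predicate holds; every later branch
+// is additionally guarded with `not(...)` of the earlier predicates. E.g.:
+//
+// cfg_if! {
+//     if #[cfg(target_feature = "avx")] {
+//         fn which() -> &'static str { "avx" }
+//     } else if #[cfg(target_feature = "sse2")] {
+//         fn which() -> &'static str { "sse2" }
+//     } else {
+//         fn which() -> &'static str { "fallback" }
+//     }
+// }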
diff --git a/coresimd/ppsv/api/masks_reductions.rs b/coresimd/ppsv/api/masks_reductions.rs
index e348a42d7b..bc7ac36d34 100644
--- a/coresimd/ppsv/api/masks_reductions.rs
+++ b/coresimd/ppsv/api/masks_reductions.rs
@@ -5,37 +5,15 @@ macro_rules! impl_mask_reductions {
     ($id:ident) => {
         impl $id {
             /// Are `all` vector lanes `true`?
-            #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             pub fn all(self) -> bool {
-                use coresimd::simd_llvm::simd_reduce_all;
-                unsafe { simd_reduce_all(self) }
+                unsafe { super::codegen::masks_reductions::All::all(self) }
             }
-            /// Are `all` vector lanes `true`?
-            #[cfg(target_arch = "aarch64")]
-            #[inline]
-            pub fn all(self) -> bool {
-                // FIXME: Broken on AArch64
-                // https://bugs.llvm.org/show_bug.cgi?id=36796
-                self.and()
-            }
-
             /// Is `any` vector lanes `true`?
-            #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             pub fn any(self) -> bool {
-                use coresimd::simd_llvm::simd_reduce_any;
-                unsafe { simd_reduce_any(self) }
+                unsafe { super::codegen::masks_reductions::Any::any(self) }
             }
-            /// Is `any` vector lanes `true`?
-            #[cfg(target_arch = "aarch64")]
-            #[inline]
-            pub fn any(self) -> bool {
-                // FIXME: Broken on AArch64
-                // https://bugs.llvm.org/show_bug.cgi?id=36796
-                self.or()
-            }
-
             /// Are `all` vector lanes `false`?
             #[inline]
             pub fn none(self) -> bool {
diff --git a/coresimd/ppsv/codegen/masks_reductions.rs b/coresimd/ppsv/codegen/masks_reductions.rs
new file mode 100644
index 0000000000..b06c2d0a29
--- /dev/null
+++ b/coresimd/ppsv/codegen/masks_reductions.rs
@@ -0,0 +1,604 @@
+//! LLVM6 currently generates sub-optimal code for the `all` mask reductions.
+//!
+//! See https://github.com/rust-lang-nursery/stdsimd/issues/362#issuecomment-372774371
+//! and the associated LLVM bug:
+//! https://bugs.llvm.org/show_bug.cgi?id=36702
+
+#![allow(unused)]
+
+use coresimd::simd::*;
+
+pub trait All: ::marker::Sized {
+    unsafe fn all(self) -> bool;
+}
+
+pub trait Any: ::marker::Sized {
+    unsafe fn any(self) -> bool;
+}
+
+// By default we use the simd_reduce_{all,any} intrinsics, which produce
+// sub-optimal code, except on aarch64, where those intrinsics are broken
+// due to https://bugs.llvm.org/show_bug.cgi?id=36796, so we just use a
+// full-blown bitwise and/or reduction there.
+macro_rules! default_impl {
+    ($id:ident) => {
+        impl All for $id {
+            #[inline]
+            unsafe fn all(self) -> bool {
+                #[cfg(not(target_arch = "aarch64"))] {
+                    use coresimd::simd_llvm::simd_reduce_all;
+                    simd_reduce_all(self)
+                }
+                #[cfg(target_arch = "aarch64")] {
+                    // FIXME: Broken on AArch64
+                    // https://bugs.llvm.org/show_bug.cgi?id=36796
+                    self.and()
+                }
+            }
+        }
+
+        impl Any for $id {
+            #[inline]
+            unsafe fn any(self) -> bool {
+                #[cfg(not(target_arch = "aarch64"))] {
+                    use coresimd::simd_llvm::simd_reduce_any;
+                    simd_reduce_any(self)
+                }
+                #[cfg(target_arch = "aarch64")] {
+                    // FIXME: Broken on AArch64
+                    // https://bugs.llvm.org/show_bug.cgi?id=36796
+                    self.or()
+                }
+            }
+        }
+    };
+}
+
+// On x86 both SSE2 and AVX2 provide movemask instructions that can be used
+// here. The AVX2 instructions aren't necessarily better than the AVX
+// instructions below, so they aren't implemented here.
+//
+// FIXME: for masks generated from f32x4 LLVM6 emits pmovmskb but should emit
+// movmskps. Since the masks don't track whether they were produced by integer
+// or floating-point vectors, we can't currently work around this. The
+// performance impact of this shouldn't be large, but it is filed as:
+// https://bugs.llvm.org/show_bug.cgi?id=37087
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+macro_rules! x86_128_sse2_movemask_impl {
+    ($id:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "sse2")]
+            unsafe fn all(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm_movemask_epi8;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm_movemask_epi8;
+                // _mm_movemask_epi8(a) creates a 16-bit mask containing the
+                // most significant bit of each byte of `a`. If all bits are
+                // set, then all 16 lanes of the mask are true.
+                _mm_movemask_epi8(::mem::transmute(self)) == u16::max_value() as i32
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "sse2")]
+            unsafe fn any(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm_movemask_epi8;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm_movemask_epi8;
+
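+                // If no lane is true, no byte has its most significant bit
+                // set and the movemask is zero; any true lane yields a
+                // non-zero mask.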
+                _mm_movemask_epi8(::mem::transmute(self)) != 0
+            }
+        }
+    }
+}
+
+// On x86 with AVX we use _mm256_testc_si256 and _mm256_testz_si256.
+//
+// FIXME: for masks generated from floating-point vectors one should use
+// x86_mm256_testc_ps, x86_mm256_testz_ps, x86_mm256_testc_pd,
+// x86_mm256_testz_pd. Since the masks don't track whether they were produced
+// by integer or floating-point vectors, we can't currently work around this.
+//
+// TODO: investigate perf impact and fill LLVM bugs as necessary.
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))]
+macro_rules! x86_256_avx_test_impl {
+    ($id:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "avx")]
+            unsafe fn all(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm256_testc_si256;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm256_testc_si256;
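+                // testc(a, b) returns 1 iff `!a & b == 0`; with `b` an
+                // all-true mask, this means every bit of `self` is set.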
+                _mm256_testc_si256(::mem::transmute(self),
+                                   ::mem::transmute($id::splat(true))) != 0
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "avx")]
+            unsafe fn any(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm256_testz_si256;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm256_testz_si256;
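+                // testz(a, a) returns 1 iff `a == 0`; a result of 0 therefore
+                // means at least one bit (lane) of the mask is set.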
+                _mm256_testz_si256(::mem::transmute(self),
+                                   ::mem::transmute(self)) == 0
+            }
+        }
+    }
+}
+
+// On x86 with SSE2, all/any for 256-bit wide vectors is implemented by running
+// the 128-bit algorithm on the lower and upper 128-bit halves of the vector
+// independently.
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
+macro_rules! x86_256_sse2_impl {
+    ($id:ident, $v128:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "sse2")]
+            unsafe fn all(self) -> bool {
+                unsafe {
+                    union U {
+                        halves: ($v128, $v128),
+                        vec: $id
+                    }
+                    let halves = U {vec: self}.halves;
+                    halves.0.all() && halves.1.all()
+                }
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "sse2")]
+            unsafe fn any(self) -> bool {
+                unsafe {
+                    union U {
+                        halves: ($v128, $v128),
+                        vec: $id
+                    }
+                    let halves = U {vec: self}.halves;
+                    halves.0.any() || halves.1.any()
+                }
+            }
+        }
+    }
+}
+
+// Implementation for 64-bit wide masks on x86.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+macro_rules! x86_64_mmx_movemask_impl {
+    ($id:ident, $vec128:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "mmx")]
+            unsafe fn all(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm_movemask_pi8;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+                // _mm_movemask_pi8(a) creates an 8-bit mask containing the
+                // most significant bit of each byte of `a`. If all bits are
+                // set, then all 8 lanes of the mask are true.
+                _mm_movemask_pi8(::mem::transmute(self)) == u8::max_value() as i32
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "mmx")]
+            unsafe fn any(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use ::coresimd::arch::x86::_mm_movemask_pi8;
+                #[cfg(target_arch = "x86_64")]
+                use ::coresimd::arch::x86_64::_mm_movemask_pi8;
+
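+                // A non-zero 8-bit movemask means at least one of the 8 lanes
+                // has its most significant bit set, i.e. is true.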
+                _mm_movemask_pi8(::mem::transmute(self)) != 0
+            }
+        }
+    }
+}
+
+// Implementation for 128-bit wide masks on x86
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+macro_rules! x86_128_impl {
+    ($id:ident) => {
+        cfg_if! {
+            if #[cfg(target_feature = "sse2")] {
+                x86_128_sse2_movemask_impl!($id);
+            } else {
+                default_impl!($id);
+            }
+        }
+    }
+}
+
+// Implementation for 256-bit wide masks on x86
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+macro_rules! x86_256_impl {
+    ($id:ident, $half_id:ident) => {
+        cfg_if! {
+            if #[cfg(target_feature = "avx")] {
+                x86_256_avx_test_impl!($id);
+            } else if #[cfg(target_feature = "sse2")] {
+                x86_256_sse2_impl!($id, $half_id);
+            } else {
+                default_impl!($id);
+            }
+        }
+    }
+}
+
+// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
+// minimum/maximum of adjacent pairs) for 64-bit wide two-element vectors.
+#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+macro_rules! arm_64_x2_v7_neon_impl {
+    ($id:ident, $vpmin:ident, $vpmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::arm::$vpmin;
+                use ::mem::transmute;
+                // pmin((a, b), (-, -)) => (min(a, b), -).0 => min(a, b)
+                let tmp: $id = transmute($vpmin(transmute(self), ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::arm::$vpmax;
+                use ::mem::transmute;
+                // pmax((a, b), (-, -)) => (max(a, b), -).0 => max(a, b)
+                let tmp: $id = transmute($vpmax(transmute(self), ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+    }
+}
+
+// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
+// minimum/maximum of adjacent pairs) for 64-bit wide four-element vectors.
+#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+macro_rules! arm_64_x4_v7_neon_impl {
+    ($id:ident, $vpmin:ident, $vpmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::arm::$vpmin;
+                use ::mem::transmute;
+                // tmp = pmin((a, b, c, d), (-, -, -, -)) => (min(a, b), min(c, d), -, -)
+                let tmp = $vpmin(transmute(self), ::mem::uninitialized());
+                // tmp = pmin(tmp, (-, -, -, -)) => (min(a, b, c, d), -, -, -).0 => min(a, b, c, d)
+                let tmp: $id = transmute($vpmin(tmp, ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::arm::$vpmax;
+                use ::mem::transmute;
+                // tmp = pmax((a, b, c, d), (-, -, -, -)) => (max(a, b), max(c, d), -, -)
+                let tmp = $vpmax(transmute(self), ::mem::uninitialized());
+                // tmp = pmax(tmp, (-, -, -, -)) => (max(a, b, c, d), -, -, -).0 => max(a, b, c, d)
+                let tmp: $id = transmute($vpmax(tmp, ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+    }
+}
+
+// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding
+// minimum/maximum of adjacent pairs) for 64-bit wide eight-element vectors.
+#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+macro_rules! arm_64_x8_v7_neon_impl {
+    ($id:ident, $vpmin:ident, $vpmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::arm::$vpmin;
+                use ::mem::transmute;
+                // tmp = pmin(
+                //     (a, b, c, d, e, f, g, h),
+                //     (-, -, -, -, -, -, -, -)
+                // ) => (min(a, b), min(c, d), min(e, f), min(g, h), -, -, -, -)
+                let tmp = $vpmin(transmute(self), ::mem::uninitialized());
+                // tmp = pmin(tmp, -) => (min(a..d), min(e..h), -, -, -, -, -, -)
+                let tmp = $vpmin(tmp, ::mem::uninitialized());
+                // tmp = pmin(tmp, -) => (min(a..h), -, -, -, -, -, -, -).0
+                //     => min(a..h)
+                let tmp: $id = transmute($vpmin(tmp, ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::arm::$vpmax;
+                use ::mem::transmute;
+                // tmp = pmax(
+                //     (a, b, c, d, e, f, g, h),
+                //     (-, -, -, -, -, -, -, -)
+                // ) => (max(a, b), max(c, d), max(e, f), max(g, h), -, -, -, -)
+                let tmp = $vpmax(transmute(self), ::mem::uninitialized());
+                // tmp = pmax(tmp, -) => (max(a..d), max(e..h), -, -, -, -, -, -)
+                let tmp = $vpmax(tmp, ::mem::uninitialized());
+                // tmp = pmax(tmp, -) => (max(a..h), -, -, -, -, -, -, -).0
+                //     => max(a..h)
+                let tmp: $id = transmute($vpmax(tmp, ::mem::uninitialized()));
+                tmp.extract(0)
+            }
+        }
+    }
+}
+
+// Implementation for ARM + v7 + NEON for 128-bit wide vectors: the vector is
+// split into two 64-bit halves, which are combined with a single vpmin/vpmax
+// (pairwise minimum/maximum), and all/any is then called on the 64-bit result.
+#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+macro_rules! arm_128_v7_neon_impl {
+    ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::arm::$vpmin;
+                use ::mem::transmute;
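+                // A single pairwise min of the two 64-bit halves folds the
+                // 128-bit mask into a 64-bit one; `all` is then reused on it.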
+                union U {
+                    halves: ($half, $half),
+                    vec: $id
+                }
+                let halves = U { vec: self }.halves;
+                let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1)));
+                h.all()
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "v7,neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::arm::$vpmax;
+                use ::mem::transmute;
+                union U {
+                    halves: ($half, $half),
+                    vec: $id
+                }
+                let halves = U { vec: self }.halves;
+                let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1)));
+                h.any()
+            }
+        }
+    }
+}
+
+// Implementation for AArch64 + NEON using vminv and vmaxv (horizontal vector
+// min/max reductions) for 128-bit wide vectors.
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+macro_rules! aarch64_128_neon_impl {
+    ($id:ident, $vmin:ident, $vmax:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "neon")]
+            unsafe fn all(self) -> bool {
+                use ::coresimd::arch::aarch64::$vmin;
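+                // The horizontal min over a mask whose lanes are all-ones or
+                // all-zeros is zero iff at least one lane is false.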
+                $vmin(::mem::transmute(self)) != 0
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "neon")]
+            unsafe fn any(self) -> bool {
+                use ::coresimd::arch::aarch64::$vmax;
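+                // The horizontal max is non-zero iff at least one lane is true.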
+                $vmax(::mem::transmute(self)) != 0
+            }
+        }
+    }
+}
+
+// Implementation for AArch64 + NEON for 64-bit wide vectors, via the 128-bit
+// vminv/vmaxv (horizontal vector min/max) implementation.
+//
+// This impl duplicates the 64-bit vector into a 128-bit one and calls
+// all/any on that.
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+macro_rules! aarch64_64_neon_impl {
+    ($id:ident, $vec128:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "neon")]
+            unsafe fn all(self) -> bool {
+                union U {
+                    halves: ($id, $id),
+                    vec: $vec128
+                }
+                U { halves: (self, self) }.vec.all()
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "neon")]
+            unsafe fn any(self) -> bool {
+                union U {
+                    halves: ($id, $id),
+                    vec: $vec128
+                }
+                U { halves: (self, self) }.vec.any()
+            }
+        }
+    }
+}
+
+macro_rules! impl_mask_all_any {
+    // 64-bit wide masks
+    (m8x8) => {
+        cfg_if! {
+            if #[cfg(target_arch = "x86_64")] {
+                x86_64_mmx_movemask_impl!(m8x8, m8x16);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_64_x8_v7_neon_impl!(m8x8, vpmin_u8, vpmax_u8);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_64_neon_impl!(m8x8, m8x16);
+            } else {
+                default_impl!(m8x8);
+            }
+        }
+    };
+    (m16x4) => {
+        cfg_if! {
+            if #[cfg(target_arch = "x86_64")] {
+                x86_64_mmx_movemask_impl!(m16x4, m16x8);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_64_x4_v7_neon_impl!(m16x4, vpmin_u16, vpmax_u16);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_64_neon_impl!(m16x4, m16x8);
+            } else {
+                default_impl!(m16x4);
+            }
+        }
+    };
+    (m32x2) => {
+        cfg_if! {
+            if #[cfg(all(target_arch = "x86_64", not(target_os = "macos")))] {
+                // FIXME: this fails on travis-ci osx build bots.
+                x86_64_mmx_movemask_impl!(m32x2, m32x4);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_64_x2_v7_neon_impl!(m32x2, vpmin_u32, vpmax_u32);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_64_neon_impl!(m32x2, m32x4);
+            } else {
+                default_impl!(m32x2);
+            }
+        }
+    };
+    // 128-bit wide masks
+    (m8x16) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_128_impl!(m8x16);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8);
+            } else {
+                default_impl!(m8x16);
+            }
+        }
+    };
+    (m16x8) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_128_impl!(m16x8);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16);
+            } else {
+                default_impl!(m16x8);
+            }
+        }
+    };
+    (m32x4) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_128_impl!(m32x4);
+            } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] {
+                arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32);
+            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+                aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32);
+            } else {
+                default_impl!(m32x4);
+            }
+        }
+    };
+    (m64x2) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_128_impl!(m64x2);
+            } else {
+                default_impl!(m64x2);
+            }
+        }
+    };
+    // 256-bit wide masks:
+    (m8x32) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_256_impl!(m8x32, m8x16);
+            } else {
+                default_impl!(m8x32);
+            }
+        }
+    };
+    (m16x16) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_256_impl!(m16x16, m16x8);
+            } else {
+                default_impl!(m16x16);
+            }
+        }
+    };
+    (m32x8) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_256_impl!(m32x8, m32x4);
+            } else {
+                default_impl!(m32x8);
+            }
+        }
+    };
+    (m64x4) => {
+        cfg_if! {
+            if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+                x86_256_impl!(m64x4, m64x2);
+            } else {
+                default_impl!(m64x4);
+            }
+        }
+    };
+    // Fallback to LLVM's default code-generation:
+    ($id:ident) => { default_impl!($id); };
+}
+
+vector_impl!(
+    [impl_mask_all_any, m1x8],
+    [impl_mask_all_any, m1x16],
+    [impl_mask_all_any, m1x32],
+    [impl_mask_all_any, m1x64],
+    [impl_mask_all_any, m8x2],
+    [impl_mask_all_any, m8x4],
+    [impl_mask_all_any, m8x8],
+    [impl_mask_all_any, m8x16],
+    [impl_mask_all_any, m8x32],
+    [impl_mask_all_any, m16x2],
+    [impl_mask_all_any, m16x4],
+    [impl_mask_all_any, m16x8],
+    [impl_mask_all_any, m16x16],
+    [impl_mask_all_any, m32x2],
+    [impl_mask_all_any, m32x4],
+    [impl_mask_all_any, m32x8],
+    [impl_mask_all_any, m64x2],
+    [impl_mask_all_any, m64x4]
+);
diff --git a/coresimd/ppsv/codegen/mod.rs b/coresimd/ppsv/codegen/mod.rs
new file mode 100644
index 0000000000..448587b795
--- /dev/null
+++ b/coresimd/ppsv/codegen/mod.rs
@@ -0,0 +1,6 @@
+//! Workarounds for code generation issues.
+
+#[cfg(target_arch = "aarch64")]
+pub mod wrapping;
+
+pub mod masks_reductions;
diff --git a/coresimd/ppsv/codegen/wrapping.rs b/coresimd/ppsv/codegen/wrapping.rs
new file mode 100644
index 0000000000..0e2f306eb0
--- /dev/null
+++ b/coresimd/ppsv/codegen/wrapping.rs
@@ -0,0 +1,42 @@
+//! Used by the wrapping_sum and wrapping_product algorithms for AArch64.
+
+pub(crate) trait Wrapping {
+    fn add(self, other: Self) -> Self;
+    fn mul(self, other: Self) -> Self;
+}
+
+macro_rules! int_impl {
+    ($id:ident) => {
+        impl Wrapping for $id {
+            fn add(self, other: Self) -> Self {
+                self.wrapping_add(other)
+            }
+            fn mul(self, other: Self) -> Self {
+                self.wrapping_mul(other)
+            }
+        }
+    };
+}
+int_impl!(i8);
+int_impl!(i16);
+int_impl!(i32);
+int_impl!(i64);
+int_impl!(u8);
+int_impl!(u16);
+int_impl!(u32);
+int_impl!(u64);
+
+macro_rules! float_impl {
+    ($id:ident) => {
+        impl Wrapping for $id {
+            fn add(self, other: Self) -> Self {
+                self + other
+            }
+            fn mul(self, other: Self) -> Self {
+                self * other
+            }
+        }
+    };
+}
+float_impl!(f32);
+float_impl!(f64);
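+
+// Sketch of the intended use (hypothetical helper, not part of this module):
+// a reduction can fold lanes through `Wrapping` so integers wrap on overflow
+// while floats use plain `+`/`*`:
+//
+//     fn wrapping_sum_of<T: Wrapping + Copy>(lanes: &[T], zero: T) -> T {
+//         lanes.iter().fold(zero, |acc, &x| Wrapping::add(acc, x))
+//     }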
diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs
index 08b7ce80d6..0d48509e3a 100644
--- a/coresimd/ppsv/mod.rs
+++ b/coresimd/ppsv/mod.rs
@@ -80,51 +80,5 @@ impl<T> FromBits<T> for T {
     }
 }
 
-/// Workarounds code generation issues.
-#[cfg(target_arch = "aarch64")]
-mod codegen {
-    #[cfg(target_arch = "aarch64")]
-    pub mod wrapping {
-        pub trait Wrapping {
-            fn add(self, other: Self) -> Self;
-            fn mul(self, other: Self) -> Self;
-        }
-
-        macro_rules! int_impl {
-            ($id:ident) => {
-                impl Wrapping for $id {
-                    fn add(self, other: Self) -> Self {
-                        self.wrapping_add(other)
-                    }
-                    fn mul(self, other: Self) -> Self {
-                        self.wrapping_mul(other)
-                    }
-                }
-            };
-        }
-        int_impl!(i8);
-        int_impl!(i16);
-        int_impl!(i32);
-        int_impl!(i64);
-        int_impl!(u8);
-        int_impl!(u16);
-        int_impl!(u32);
-        int_impl!(u64);
-
-        macro_rules! float_impl {
-            ($id:ident) => {
-                impl Wrapping for $id {
-                    fn add(self, other: Self) -> Self {
-                        self + other
-                    }
-                    fn mul(self, other: Self) -> Self {
-                        self * other
-                    }
-                }
-            };
-        }
-        float_impl!(f32);
-        float_impl!(f64);
-    }
-
-}
+/// Workarounds for code generation issues.
+mod codegen;
diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs
index 5320bcba37..1963e0720c 100644
--- a/crates/assert-instr-macro/src/lib.rs
+++ b/crates/assert-instr-macro/src/lib.rs
@@ -33,12 +33,18 @@ pub fn assert_instr(
     };
 
     let instr = &invoc.instr;
-    let maybe_ignore = if cfg!(optimized) {
+    let name = &func.ident;
+
+    // Disable assert_instr for x86 targets compiled with AVX enabled, which
+    // causes LLVM to generate different instructions than the ones we are
+    // testing for.
+    let disable_assert_instr = std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok();
+    let maybe_ignore = if cfg!(optimized) && !disable_assert_instr {
         TokenStream::empty()
     } else {
         (quote! { #[ignore] }).into()
     };
-    let name = &func.ident;
+
     use quote::ToTokens;
     let instr_str = instr
         .clone()
diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs
index be097b9779..aaf61563bd 100644
--- a/crates/coresimd/src/lib.rs
+++ b/crates/coresimd/src/lib.rs
@@ -13,7 +13,7 @@
            simd_ffi, asm,
            integer_atomics, stmt_expr_attributes, core_intrinsics,
            crate_in_paths, no_core, attr_literals, rustc_attrs, stdsimd,
-           staged_api, fn_must_use, core_float, core_slice_ext, align_offset,
+           staged_api, core_float, core_slice_ext, align_offset,
            doc_cfg, mmx_target_feature, tbm_target_feature,
            sse4a_target_feature, arm_target_feature, aarch64_target_feature,
            mips_target_feature)]
diff --git a/examples/nbody.rs b/examples/nbody.rs
index c67fbdc561..8df5aa290c 100644
--- a/examples/nbody.rs
+++ b/examples/nbody.rs
@@ -59,9 +59,9 @@ impl Frsqrt for f64x2 {
                       all(target_arch = "aarch64",
                           target_feature = "neon"))))]
         {
-            self.replace(0, 1. / self.extract(0).sqrt());
-            self.replace(1, 1. / self.extract(1).sqrt());
-            *self
+            let r = self.replace(0, 1. / self.extract(0).sqrt());
+            let r = r.replace(1, 1. / self.extract(1).sqrt());
+            r
         }
     }
 }