diff --git a/coresimd/src/lib.rs b/coresimd/src/lib.rs index a3c02fcaff..dfca161ccc 100644 --- a/coresimd/src/lib.rs +++ b/coresimd/src/lib.rs @@ -13,7 +13,7 @@ #![allow(unused_features)] #![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi, target_feature, cfg_target_feature, i128_type, asm, - const_atomic_usize_new, stmt_expr_attributes, core_intrinsics, + integer_atomics, stmt_expr_attributes, core_intrinsics, crate_in_paths)] #![cfg_attr(test, feature(proc_macro, test, attr_literals))] #![cfg_attr(feature = "cargo-clippy", diff --git a/coresimd/src/runtime/aarch64.rs b/coresimd/src/runtime/aarch64.rs index fbbf856855..941a614171 100644 --- a/coresimd/src/runtime/aarch64.rs +++ b/coresimd/src/runtime/aarch64.rs @@ -1,5 +1,5 @@ //! Run-time feature detection on ARM Aarch64. -use runtime::bit; +use runtime::cache; use runtime::arch::HasFeature; #[macro_export] @@ -32,12 +32,12 @@ pub enum __Feature { pmull, } -pub fn detect_features(mut x: T) -> usize { - let mut value: usize = 0; +pub fn detect_features(mut x: T) -> cache::Initializer { + let mut value = cache::Initializer::default(); { let mut enable_feature = |f| { if x.has_feature(&f) { - value = bit::set(value, f as u32); + value.set(f as u32); } }; enable_feature(__Feature::asimd); diff --git a/coresimd/src/runtime/arm.rs b/coresimd/src/runtime/arm.rs index 4c4dbb4cbc..a4e9e89e51 100644 --- a/coresimd/src/runtime/arm.rs +++ b/coresimd/src/runtime/arm.rs @@ -1,5 +1,5 @@ //! Run-time feature detection on ARM Aarch32. -use runtime::bit; +use runtime::cache; use runtime::arch::HasFeature; #[macro_export] @@ -28,12 +28,12 @@ pub enum __Feature { pmull, } -pub fn detect_features(mut x: T) -> usize { - let mut value: usize = 0; +pub fn detect_features(mut x: T) -> cache::Initializer { + let mut value = cache::Initializer::default(); { let mut enable_feature = |f| { if x.has_feature(&f) { - value = bit::set(value, f as u32); + value.set(f as u32); } }; enable_feature(__Feature::neon); diff --git a/coresimd/src/runtime/bit.rs b/coresimd/src/runtime/bit.rs index 42483e5225..e7b0ab61b3 100644 --- a/coresimd/src/runtime/bit.rs +++ b/coresimd/src/runtime/bit.rs @@ -1,11 +1,7 @@ -//! Bit manipulation utilities - -/// Sets the `bit` of `x`. -pub const fn set(x: usize, bit: u32) -> usize { - x | 1 << bit -} +//! Bit manipulation utilities. /// Tests the `bit` of `x`. -pub const fn test(x: usize, bit: u32) -> bool { +pub fn test(x: usize, bit: u32) -> bool { + debug_assert!(bit < 32, "bit index out-of-bounds"); x & (1 << bit) != 0 } diff --git a/coresimd/src/runtime/cache.rs b/coresimd/src/runtime/cache.rs index bb247fb531..ef4acf0a9c 100644 --- a/coresimd/src/runtime/cache.rs +++ b/coresimd/src/runtime/cache.rs @@ -1,20 +1,136 @@ -//! Cache of run-time feature detection +//! Caches run-time feature detection so that it only needs to be computed +//! once. -use core::sync::atomic::{AtomicUsize, Ordering}; -use core::usize; +use core::sync::atomic::Ordering; -use super::bit; +#[cfg(target_pointer_width = "64")] +use core::sync::atomic::AtomicU64; -/// This global variable is a bitset used to cache the features supported by -/// the -/// CPU. -static CACHE: AtomicUsize = AtomicUsize::new(usize::MAX); +#[cfg(target_pointer_width = "32")] +use core::sync::atomic::AtomicU32; + +/// Sets the `bit` of `x`. +pub const fn set_bit(x: u64, bit: u32) -> u64 { + x | 1 << bit +} + +/// Tests the `bit` of `x`. 
+pub const fn test_bit(x: u64, bit: u32) -> bool {
+    x & (1 << bit) != 0
+}
+
+/// Maximum number of features that can be cached.
+const CACHE_CAPACITY: u32 = 63;
+
+/// This type is used to initialize the cache.
+pub struct Initializer(u64);
+
+impl Default for Initializer {
+    fn default() -> Self {
+        Initializer(0)
+    }
+}
+
+impl Initializer {
+    /// Tests the `bit` of the cache.
+    pub fn test(&self, bit: u32) -> bool {
+        // FIXME: this way of making sure that the cache is large enough is
+        // brittle.
+        debug_assert!(
+            bit < CACHE_CAPACITY,
+            "too many features, time to increase the cache size!"
+        );
+        test_bit(self.0, bit)
+    }
+    /// Sets the `bit` of the cache.
+    pub fn set(&mut self, bit: u32) {
+        // FIXME: this way of making sure that the cache is large enough is
+        // brittle.
+        debug_assert!(
+            bit < CACHE_CAPACITY,
+            "too many features, time to increase the cache size!"
+        );
+        let v = self.0;
+        self.0 = set_bit(v, bit);
+    }
+}
+
+/// This global variable is a cache of the features supported by the CPU.
+static CACHE: Cache = Cache::uninitialized();
+
+/// Feature cache with capacity for `CACHE_CAPACITY` features.
+///
+/// Note: the last feature bit is used to represent an
+/// uninitialized cache.
+#[cfg(target_pointer_width = "64")]
+struct Cache(AtomicU64);
+
+#[cfg(target_pointer_width = "64")]
+impl Cache {
+    /// Creates an uninitialized cache.
+    const fn uninitialized() -> Self {
+        Cache(AtomicU64::new(u64::max_value()))
+    }
+    /// Is the cache uninitialized?
+    pub fn is_uninitialized(&self) -> bool {
+        self.0.load(Ordering::Relaxed) == u64::max_value()
+    }
+
+    /// Is the `bit` in the cache set?
+    pub fn test(&self, bit: u32) -> bool {
+        test_bit(self.0.load(Ordering::Relaxed), bit)
+    }
+
+    /// Initializes the cache.
+    pub fn initialize(&self, value: Initializer) {
+        self.0.store(value.0, Ordering::Relaxed);
+    }
+}
+
+/// Feature cache with capacity for `CACHE_CAPACITY` features.
+///
+/// Note: the last feature bit is used to represent an
+/// uninitialized cache.
+#[cfg(target_pointer_width = "32")]
+struct Cache(AtomicU32, AtomicU32);
+
+#[cfg(target_pointer_width = "32")]
+impl Cache {
+    /// Creates an uninitialized cache.
+    const fn uninitialized() -> Self {
+        Cache(
+            AtomicU32::new(u32::max_value()),
+            AtomicU32::new(u32::max_value()),
+        )
+    }
+    /// Is the cache uninitialized?
+    pub fn is_uninitialized(&self) -> bool {
+        self.1.load(Ordering::Relaxed) == u32::max_value()
+    }
+
+    /// Is the `bit` in the cache set?
+    pub fn test(&self, bit: u32) -> bool {
+        if bit < 32 {
+            test_bit(self.0.load(Ordering::Relaxed) as u64, bit)
+        } else {
+            test_bit(self.1.load(Ordering::Relaxed) as u64, bit - 32)
+        }
+    }
+
+    /// Initializes the cache.
+    pub fn initialize(&self, value: Initializer) {
+        let lo: u32 = value.0 as u32;
+        let hi: u32 = (value.0 >> 32) as u32;
+        self.0.store(lo, Ordering::Relaxed);
+        self.1.store(hi, Ordering::Relaxed);
+    }
+}
 
 /// Test the `bit` of the storage. If the storage has not been initialized,
 /// initializes it with the result of `f()`.
 ///
 /// On its first invocation, it detects the CPU features and caches them in the
-/// `FEATURES` global variable as an `AtomicUsize`.
+/// `CACHE` global variable as an `AtomicU64`.
 ///
 /// It uses the `__Feature` variant to index into this variable as a bitset. If
 /// the bit is set, the feature is enabled, and otherwise it is disabled.
@@ -22,10 +138,10 @@ static CACHE: AtomicUsize = AtomicUsize::new(usize::MAX);
 /// PLEASE: do not use this, it is an implementation detail subject to change.
pub fn test(bit: u32, f: F) -> bool where - F: FnOnce() -> usize, + F: FnOnce() -> Initializer, { - if CACHE.load(Ordering::Relaxed) == usize::MAX { - CACHE.store(f(), Ordering::Relaxed); + if CACHE.is_uninitialized() { + CACHE.initialize(f()); } - bit::test(CACHE.load(Ordering::Relaxed), bit) + CACHE.test(bit) } diff --git a/coresimd/src/runtime/powerpc64.rs b/coresimd/src/runtime/powerpc64.rs index 8ee02e4185..16ed59a717 100644 --- a/coresimd/src/runtime/powerpc64.rs +++ b/coresimd/src/runtime/powerpc64.rs @@ -1,5 +1,5 @@ //! Run-time feature detection on PowerPC64. -use runtime::bit; +use runtime::cache; use runtime::arch::HasFeature; #[macro_export] @@ -33,12 +33,12 @@ pub enum __Feature { power8, } -pub fn detect_features(mut x: T) -> usize { - let mut value: usize = 0; +pub fn detect_features(mut x: T) -> cache::Initializer { + let mut value = cache::Initializer::default(); { let mut enable_feature = |f| { if x.has_feature(&f) { - value = bit::set(value, f as u32); + value.set(f as u32); } }; enable_feature(__Feature::altivec); diff --git a/coresimd/src/runtime/x86.rs b/coresimd/src/runtime/x86.rs index a994e61c42..9b97deef29 100644 --- a/coresimd/src/runtime/x86.rs +++ b/coresimd/src/runtime/x86.rs @@ -18,7 +18,7 @@ use core::mem; -use super::bit; +use super::{bit, cache}; /// This macro maps the string-literal feature names to values of the /// `__Feature` enum at compile-time. The feature names used are the same as @@ -29,6 +29,12 @@ use super::bit; #[macro_export] #[doc(hidden)] macro_rules! __unstable_detect_feature { + ("aes", $unstable_detect_feature:path) => { + $unstable_detect_feature( + $crate::__vendor_runtime::__Feature::aes{}) }; + ("tsc", $unstable_detect_feature:path) => { + $unstable_detect_feature( + $crate::__vendor_runtime::__Feature::tsc{}) }; ("mmx", $unstable_detect_feature:path) => { $unstable_detect_feature( $crate::__vendor_runtime::__Feature::mmx{}) }; @@ -168,6 +174,10 @@ macro_rules! __unstable_detect_feature { #[allow(non_camel_case_types)] #[repr(u8)] pub enum __Feature { + /// AES (Advanced Encryption Standard New Instructions AES-NI) + aes, + /// TSC (Time Stamp Counter) + tsc, /// MMX mmx, /// SSE (Streaming SIMD Extensions) @@ -232,7 +242,8 @@ pub enum __Feature { xsaves, /// XSAVEC (Save Processor Extended States Compacted) xsavec, - #[doc(hidden)] __NonExhaustive, + #[doc(hidden)] + __NonExhaustive, } /// Run-time feature detection on x86 works by using the CPUID instruction. @@ -250,10 +261,10 @@ pub enum __Feature { /// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID /// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf -pub fn detect_features() -> usize { +pub fn detect_features() -> cache::Initializer { use vendor::{__cpuid, __cpuid_count, has_cpuid, CpuidResult}; use vendor::_xgetbv; - let mut value: usize = 0; + let mut value = cache::Initializer::default(); // If the x86 CPU does not support the CPUID instruction then it is too // old to support any of the currently-detectable features. 
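The `cache.rs` rewrite above replaces the old `AtomicUsize` bitset with an `Initializer` value that detection code fills in, plus a fixed-capacity `Cache` backed by one `AtomicU64` on 64-bit targets (or two `AtomicU32`s on 32-bit targets). All bits set marks the cache as uninitialized, which is why only `CACHE_CAPACITY = 63` feature bits are usable. A minimal, self-contained sketch of the scheme, assuming the 64-bit case and using illustrative names rather than the crate's actual API:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// All bits set means "detection has not run yet"; consequently only
// 63 of the 64 bits can serve as feature flags.
const UNINITIALIZED: u64 = u64::max_value();
static CACHE: AtomicU64 = AtomicU64::new(UNINITIALIZED);

/// Returns whether feature `bit` is cached as available, running
/// `detect` at most once. Racing threads may both run `detect`, but
/// they store the same value, so `Ordering::Relaxed` is sufficient.
fn test_feature<F: FnOnce() -> u64>(bit: u32, detect: F) -> bool {
    debug_assert!(bit < 63, "bit 63 is reserved as the uninitialized sentinel");
    if CACHE.load(Ordering::Relaxed) == UNINITIALIZED {
        CACHE.store(detect(), Ordering::Relaxed);
    }
    CACHE.load(Ordering::Relaxed) & (1 << bit) != 0
}

fn main() {
    // Pretend detection found features 0 and 2.
    assert!(test_feature(0, || 0b101));
    assert!(test_feature(2, || unreachable!("already cached")));
    assert!(!test_feature(1, || unreachable!("already cached")));
}
```

This is also why `detect_features` now returns `cache::Initializer` instead of `usize`: the cache width is fixed at 64 bits regardless of the target's pointer width.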
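The new `aes` and `tsc` variants added to `__Feature` above are populated from CPUID leaf 1 in the hunk that follows: `proc_info_ecx` bit 25 carries AES-NI and `proc_info_edx` bit 4 carries TSC. For orientation, a standalone sketch of reading those two bits; it uses the later-stabilized `std::arch` CPUID wrapper for brevity, not the crate-internal `vendor::__cpuid` this diff relies on:

```rust
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn detect_aes_tsc() -> (bool, bool) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::__cpuid;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::__cpuid;

    // CPUID leaf 1 returns the processor-info feature bits in ECX/EDX.
    let info = unsafe { __cpuid(1) };
    let aes = info.ecx & (1 << 25) != 0; // AES-NI: ECX bit 25
    let tsc = info.edx & (1 << 4) != 0; // TSC: EDX bit 4
    (aes, tsc)
}

fn main() {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let (aes, tsc) = detect_aes_tsc();
        println!("aes: {}, tsc: {}", aes, tsc);
    }
}
```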
@@ -329,7 +340,7 @@ pub fn detect_features() -> usize { // borrows value till the end of this scope: let mut enable = |r, rb, f| { if bit::test(r as usize, rb) { - value = bit::set(value, f as u32); + value.set(f as u32); } }; @@ -339,8 +350,10 @@ pub fn detect_features() -> usize { enable(proc_info_ecx, 19, __Feature::sse4_1); enable(proc_info_ecx, 20, __Feature::sse4_2); enable(proc_info_ecx, 23, __Feature::popcnt); - enable(proc_info_edx, 24, __Feature::fxsr); + enable(proc_info_ecx, 25, __Feature::aes); + enable(proc_info_edx, 4, __Feature::tsc); enable(proc_info_edx, 23, __Feature::mmx); + enable(proc_info_edx, 24, __Feature::fxsr); enable(proc_info_edx, 25, __Feature::sse); enable(proc_info_edx, 26, __Feature::sse2); @@ -449,6 +462,8 @@ mod tests { #[test] fn dump() { + println!("aes: {:?}", cfg_feature_enabled!("aes")); + println!("tsc: {:?}", cfg_feature_enabled!("tsc")); println!("sse: {:?}", cfg_feature_enabled!("sse")); println!("sse2: {:?}", cfg_feature_enabled!("sse2")); println!("sse3: {:?}", cfg_feature_enabled!("sse3")); @@ -488,6 +503,8 @@ mod tests { #[test] fn compare_with_cupid() { let information = cupid::master().unwrap(); + assert_eq!(cfg_feature_enabled!("aes"), information.aesni()); + assert_eq!(cfg_feature_enabled!("tsc"), information.tsc()); assert_eq!(cfg_feature_enabled!("sse"), information.sse()); assert_eq!(cfg_feature_enabled!("sse2"), information.sse2()); assert_eq!(cfg_feature_enabled!("sse3"), information.sse3()); diff --git a/coresimd/src/v128.rs b/coresimd/src/v128.rs index cc5888f616..cf98340d83 100644 --- a/coresimd/src/v128.rs +++ b/coresimd/src/v128.rs @@ -42,86 +42,14 @@ define_impl! { x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 } -define_from!( - u64x2, - i64x2, - u32x4, - i32x4, - u16x8, - i16x8, - u8x16, - i8x16 -); -define_from!( - i64x2, - u64x2, - u32x4, - i32x4, - u16x8, - i16x8, - u8x16, - i8x16 -); -define_from!( - u32x4, - u64x2, - i64x2, - i32x4, - u16x8, - i16x8, - u8x16, - i8x16 -); -define_from!( - i32x4, - u64x2, - i64x2, - u32x4, - u16x8, - i16x8, - u8x16, - i8x16 -); -define_from!( - u16x8, - u64x2, - i64x2, - u32x4, - i32x4, - i16x8, - u8x16, - i8x16 -); -define_from!( - i16x8, - u64x2, - i64x2, - u32x4, - i32x4, - u16x8, - u8x16, - i8x16 -); -define_from!( - u8x16, - u64x2, - i64x2, - u32x4, - i32x4, - u16x8, - i16x8, - i8x16 -); -define_from!( - i8x16, - u64x2, - i64x2, - u32x4, - i32x4, - u16x8, - i16x8, - u8x16 -); +define_from!(u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16); +define_from!(i64x2, u64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16); +define_from!(u32x4, u64x2, i64x2, i32x4, u16x8, i16x8, u8x16, i8x16); +define_from!(i32x4, u64x2, i64x2, u32x4, u16x8, i16x8, u8x16, i8x16); +define_from!(u16x8, u64x2, i64x2, u32x4, i32x4, i16x8, u8x16, i8x16); +define_from!(i16x8, u64x2, i64x2, u32x4, i32x4, u16x8, u8x16, i8x16); +define_from!(u8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, i8x16); +define_from!(i8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16); define_common_ops!( f64x2, diff --git a/coresimd/src/v256.rs b/coresimd/src/v256.rs index 1cbe7bdf26..1f02bad0f2 100644 --- a/coresimd/src/v256.rs +++ b/coresimd/src/v256.rs @@ -66,86 +66,14 @@ define_impl! 
{ x24, x25, x26, x27, x28, x29, x30, x31 } -define_from!( - u64x4, - i64x4, - u32x8, - i32x8, - u16x16, - i16x16, - u8x32, - i8x32 -); -define_from!( - i64x4, - u64x4, - u32x8, - i32x8, - u16x16, - i16x16, - u8x32, - i8x32 -); -define_from!( - u32x8, - u64x4, - i64x4, - i32x8, - u16x16, - i16x16, - u8x32, - i8x32 -); -define_from!( - i32x8, - u64x4, - i64x4, - u32x8, - u16x16, - i16x16, - u8x32, - i8x32 -); -define_from!( - u16x16, - u64x4, - i64x4, - u32x8, - i32x8, - i16x16, - u8x32, - i8x32 -); -define_from!( - i16x16, - u64x4, - i64x4, - u32x8, - i32x8, - u16x16, - u8x32, - i8x32 -); -define_from!( - u8x32, - u64x4, - i64x4, - u32x8, - i32x8, - u16x16, - i16x16, - i8x32 -); -define_from!( - i8x32, - u64x4, - i64x4, - u32x8, - i32x8, - u16x16, - i16x16, - u8x32 -); +define_from!(u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); +define_from!(i64x4, u64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32); +define_from!(u32x8, u64x4, i64x4, i32x8, u16x16, i16x16, u8x32, i8x32); +define_from!(i32x8, u64x4, i64x4, u32x8, u16x16, i16x16, u8x32, i8x32); +define_from!(u16x16, u64x4, i64x4, u32x8, i32x8, i16x16, u8x32, i8x32); +define_from!(i16x16, u64x4, i64x4, u32x8, i32x8, u16x16, u8x32, i8x32); +define_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32); +define_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32); define_common_ops!( f64x4, diff --git a/coresimd/src/x86/i386/mod.rs b/coresimd/src/x86/i386/mod.rs index 9f32390924..4e0adbae72 100644 --- a/coresimd/src/x86/i386/mod.rs +++ b/coresimd/src/x86/i386/mod.rs @@ -3,7 +3,6 @@ mod eflags; pub use self::eflags::*; - #[cfg(dont_compile_me)] // TODO: need to upstream `fxsr` target feature mod fxsr; #[cfg(dont_compile_me)] // TODO: need to upstream `fxsr` target feature diff --git a/coresimd/src/x86/i586/avx.rs b/coresimd/src/x86/i586/avx.rs index cba133c734..3552fe820b 100644 --- a/coresimd/src/x86/i586/avx.rs +++ b/coresimd/src/x86/i586/avx.rs @@ -956,7 +956,6 @@ pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { - let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ($a:expr, $b:expr, $c:expr, $d:expr) => { @@ -1094,7 +1093,9 @@ pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))] -pub unsafe fn _mm256_permute2f128_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { +pub unsafe fn _mm256_permute2f128_ps( + a: __m256, b: __m256, imm8: i32 +) -> __m256 { macro_rules! call { ($imm8:expr) => { vperm2f128ps256(a, b, $imm8) } } @@ -1106,7 +1107,9 @@ pub unsafe fn _mm256_permute2f128_ps(a: __m256, b: __m256, imm8: i32) -> __m256 #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] -pub unsafe fn _mm256_permute2f128_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { +pub unsafe fn _mm256_permute2f128_pd( + a: __m256d, b: __m256d, imm8: i32 +) -> __m256d { macro_rules! 
call { ($imm8:expr) => { vperm2f128pd256(a, b, $imm8) } } @@ -1195,7 +1198,9 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] -pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d { +pub unsafe fn _mm256_insertf128_pd( + a: __m256d, b: __m128d, imm8: i32 +) -> __m256d { match imm8 & 1 { 0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]), _ => simd_shuffle4(a, _mm256_castpd128_pd256(b), [0, 1, 4, 5]), @@ -1391,7 +1396,9 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] -pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d { +pub unsafe fn _mm256_maskload_pd( + mem_addr: *const f64, mask: __m256i +) -> __m256d { maskloadpd256(mem_addr as *const i8, mask.as_i64x4()) } @@ -1400,7 +1407,9 @@ pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] -pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) { +pub unsafe fn _mm256_maskstore_pd( + mem_addr: *mut f64, mask: __m256i, a: __m256d +) { maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a); } @@ -1429,7 +1438,9 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] -pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 { +pub unsafe fn _mm256_maskload_ps( + mem_addr: *const f32, mask: __m256i +) -> __m256 { maskloadps256(mem_addr as *const i8, mask.as_i32x8()) } @@ -1438,7 +1449,9 @@ pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] -pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) { +pub unsafe fn _mm256_maskstore_ps( + mem_addr: *mut f32, mask: __m256i, a: __m256 +) { maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a); } @@ -2755,7 +2768,8 @@ mod tests { let result_closest = _mm256_round_ps(a, 0b00000000); let result_down = _mm256_round_ps(a, 0b00000001); let result_up = _mm256_round_ps(a, 0b00000010); - let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.); + let expected_closest = + _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.); let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.); let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.); assert_eq_m256(result_closest, expected_closest); @@ -2865,7 +2879,8 @@ mod tests { let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); let r = _mm256_dp_ps(a, b, 0xFF); - let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.); + let e = + _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.); assert_eq_m256(r, e); } @@ -3282,11 +3297,17 @@ mod tests { #[simd_test = "avx"] unsafe fn test_mm256_insert_epi16() { - let a = - _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); let r = _mm256_insert_epi16(a, 0, 15); - let e = - _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
11, 12, 13, 14, 0); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 0, + ); assert_eq_m256i(r, e); } @@ -3865,10 +3886,12 @@ mod tests { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ); - assert_eq_m256i( - r, - _mm256_setr_epi16(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = _mm256_setr_epi16( + 16, 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, ); + assert_eq_m256i(r, e); } #[simd_test = "avx"] @@ -3922,10 +3945,12 @@ mod tests { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ); - assert_eq_m256i( - r, - _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = _mm256_setr_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, ); + assert_eq_m256i(r, e); } #[simd_test = "avx"] @@ -4196,8 +4221,11 @@ mod tests { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ); - let lo = - _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + #[cfg_attr(rustfmt, rustfmt_skip)] + let lo = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); let r = _mm256_loadu2_m128i( &hi as *const _ as *const _, &lo as *const _ as *const _, diff --git a/coresimd/src/x86/i586/avx2.rs b/coresimd/src/x86/i586/avx2.rs index 540031009e..aff7e4d16c 100644 --- a/coresimd/src/x86/i586/avx2.rs +++ b/coresimd/src/x86/i586/avx2.rs @@ -201,7 +201,10 @@ pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vandnps))] pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { let all_ones = _mm256_set1_epi8(-1); - mem::transmute(simd_and(simd_xor(a.as_i64x4(), all_ones.as_i64x4()), b.as_i64x4())) + mem::transmute(simd_and( + simd_xor(a.as_i64x4(), all_ones.as_i64x4()), + b.as_i64x4(), + )) } /// Average packed unsigned 16-bit integers in `a` and `b`. 
@@ -256,7 +259,9 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))] -pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { +pub unsafe fn _mm256_blend_epi32( + a: __m256i, b: __m256i, imm8: i32 +) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; let a = a.as_i32x8(); let b = b.as_i32x8(); @@ -308,7 +313,9 @@ pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))] -pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { +pub unsafe fn _mm256_blend_epi16( + a: __m256i, b: __m256i, imm8: i32 +) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; let a = a.as_i16x16(); let b = b.as_i16x16(); @@ -362,7 +369,9 @@ pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendvb))] -pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { +pub unsafe fn _mm256_blendv_epi8( + a: __m256i, b: __m256i, mask: __m256i +) -> __m256i { mem::transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32())) } @@ -776,7 +785,8 @@ pub unsafe fn _mm_i32gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi32( - src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, scale: i32 + src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, + scale: i32, ) -> __m128i { let src = src.as_i32x4(); let mask = mask.as_i32x4(); @@ -817,7 +827,8 @@ pub unsafe fn _mm256_i32gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi32( - src: __m256i, slice: *const i32, offsets: __m256i, mask: __m256i, scale: i32 + src: __m256i, slice: *const i32, offsets: __m256i, mask: __m256i, + scale: i32, ) -> __m256i { let src = src.as_i32x8(); let mask = mask.as_i32x8(); @@ -932,7 +943,8 @@ pub unsafe fn _mm_i32gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi64( - src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, scale: i32 + src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, + scale: i32, ) -> __m128i { let src = src.as_i64x2(); let mask = mask.as_i64x2(); @@ -973,7 +985,8 @@ pub unsafe fn _mm256_i32gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi64( - src: __m256i, slice: *const i64, offsets: __m128i, mask: __m256i, scale: i32 + src: __m256i, slice: *const i64, offsets: __m128i, mask: __m256i, + scale: i32, ) -> __m256i { let src = src.as_i64x4(); let mask = mask.as_i64x4(); @@ -1013,7 +1026,8 @@ pub unsafe fn _mm_i32gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm_mask_i32gather_pd( - src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, scale: i32 + src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, + scale: i32, ) -> __m128d { let offsets = offsets.as_i32x4(); let slice = slice as *const i8; @@ -1050,7 +1064,8 @@ pub unsafe fn _mm256_i32gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, 
scale = 1))] pub unsafe fn _mm256_mask_i32gather_pd( - src: __m256d, slice: *const f64, offsets: __m128i, mask: __m256d, scale: i32 + src: __m256d, slice: *const f64, offsets: __m128i, mask: __m256d, + scale: i32, ) -> __m256d { let offsets = offsets.as_i32x4(); let slice = slice as *const i8; @@ -1088,7 +1103,8 @@ pub unsafe fn _mm_i64gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi32( - src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, scale: i32 + src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, + scale: i32, ) -> __m128i { let src = src.as_i32x4(); let mask = mask.as_i32x4(); @@ -1129,7 +1145,8 @@ pub unsafe fn _mm256_i64gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi32( - src: __m128i, slice: *const i32, offsets: __m256i, mask: __m128i, scale: i32 + src: __m128i, slice: *const i32, offsets: __m256i, mask: __m128i, + scale: i32, ) -> __m128i { let src = src.as_i32x4(); let mask = mask.as_i32x4(); @@ -1244,7 +1261,8 @@ pub unsafe fn _mm_i64gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi64( - src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, scale: i32 + src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, + scale: i32, ) -> __m128i { let src = src.as_i64x2(); let mask = mask.as_i64x2(); @@ -1285,7 +1303,8 @@ pub unsafe fn _mm256_i64gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi64( - src: __m256i, slice: *const i64, offsets: __m256i, mask: __m256i, scale: i32 + src: __m256i, slice: *const i64, offsets: __m256i, mask: __m256i, + scale: i32, ) -> __m256i { let src = src.as_i64x4(); let mask = mask.as_i64x4(); @@ -1325,7 +1344,8 @@ pub unsafe fn _mm_i64gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm_mask_i64gather_pd( - src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, scale: i32 + src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, + scale: i32, ) -> __m128d { let slice = slice as *const i8; let offsets = offsets.as_i64x2(); @@ -1362,7 +1382,8 @@ pub unsafe fn _mm256_i64gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_pd( - src: __m256d, slice: *const f64, offsets: __m256i, mask: __m256d, scale: i32 + src: __m256d, slice: *const f64, offsets: __m256i, mask: __m256d, + scale: i32, ) -> __m256d { let slice = slice as *const i8; let offsets = offsets.as_i64x4(); @@ -1416,7 +1437,9 @@ pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] -pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i { +pub unsafe fn _mm_maskload_epi32( + mem_addr: *const i32, mask: __m128i +) -> __m128i { mem::transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4())) } @@ -1438,7 +1461,9 @@ pub unsafe fn _mm256_maskload_epi32( #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] -pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i { +pub unsafe fn _mm_maskload_epi64( + mem_addr: *const i64, mask: 
__m128i +) -> __m128i { mem::transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2())) } @@ -1460,7 +1485,9 @@ pub unsafe fn _mm256_maskload_epi64( #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] -pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) { +pub unsafe fn _mm_maskstore_epi32( + mem_addr: *mut i32, mask: __m128i, a: __m128i +) { maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4()) } @@ -1482,7 +1509,9 @@ pub unsafe fn _mm256_maskstore_epi32( #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] -pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) { +pub unsafe fn _mm_maskstore_epi64( + mem_addr: *mut i64, mask: __m128i, a: __m128i +) { maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2()) } @@ -1625,7 +1654,9 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))] -pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i { +pub unsafe fn _mm256_mpsadbw_epu8( + a: __m256i, b: __m256i, imm8: i32 +) -> __m256i { let a = a.as_u8x32(); let b = b.as_u8x32(); macro_rules! call { @@ -2733,7 +2764,11 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhdq))] pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); + let r: i32x8 = simd_shuffle8( + a.as_i32x8(), + b.as_i32x8(), + [2, 10, 3, 11, 6, 14, 7, 15], + ); mem::transmute(r) } @@ -2769,7 +2804,8 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckldq))] pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); + let r: i32x8 = + simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); mem::transmute(r) } @@ -3113,19 +3149,23 @@ extern "C" { ) -> i64x4; #[link_name = "llvm.x86.avx2.gather.d.pd"] fn pgatherdpd( - src: __m128d, slice: *const i8, offsets: i32x4, mask: __m128d, scale: i8 + src: __m128d, slice: *const i8, offsets: i32x4, mask: __m128d, + scale: i8, ) -> __m128d; #[link_name = "llvm.x86.avx2.gather.d.pd.256"] fn vpgatherdpd( - src: __m256d, slice: *const i8, offsets: i32x4, mask: __m256d, scale: i8 + src: __m256d, slice: *const i8, offsets: i32x4, mask: __m256d, + scale: i8, ) -> __m256d; #[link_name = "llvm.x86.avx2.gather.q.pd"] fn pgatherqpd( - src: __m128d, slice: *const i8, offsets: i64x2, mask: __m128d, scale: i8 + src: __m128d, slice: *const i8, offsets: i64x2, mask: __m128d, + scale: i8, ) -> __m128d; #[link_name = "llvm.x86.avx2.gather.q.pd.256"] fn vpgatherqpd( - src: __m256d, slice: *const i8, offsets: i64x4, mask: __m256d, scale: i8 + src: __m256d, slice: *const i8, offsets: i64x4, mask: __m256d, + scale: i8, ) -> __m256d; #[link_name = "llvm.x86.avx2.gather.d.ps"] fn pgatherdps( @@ -3228,10 +3268,16 @@ mod tests { #[simd_test = "avx2"] unsafe fn test_mm256_add_epi16() { - let a = - _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = - _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 
8, 9, 10, 11, 12, 13, 14, 15, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let b = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); let r = _mm256_add_epi16(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] let e = _mm256_setr_epi16( @@ -3313,8 +3359,11 @@ mod tests { #[simd_test = "avx2"] unsafe fn test_mm256_adds_epi16() { - let a = - _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm256_setr_epi16( 32, 33, 34, 35, 36, 37, 38, 39, @@ -3383,8 +3432,11 @@ mod tests { #[simd_test = "avx2"] unsafe fn test_mm256_adds_epu16() { - let a = - _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm256_setr_epi16( 32, 33, 34, 35, 36, 37, 38, 39, @@ -3468,7 +3520,8 @@ mod tests { #[simd_test = "avx2"] unsafe fn test_mm256_blend_epi16() { let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9)); - let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3,3, 3, 3, 3, 3); + let e = + _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3); let r = _mm256_blend_epi16(a, b, 0x01 as i32); assert_eq_m256i(r, e); @@ -3604,10 +3657,16 @@ mod tests { #[simd_test = "avx2"] unsafe fn test_mm256_cmpeq_epi16() { - let a = - _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = - _mm256_setr_epi16(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let b = _mm256_setr_epi16( + 15, 14, 2, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, + ); let r = _mm256_cmpeq_epi16(a, b); assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2)); } @@ -3627,10 +3686,7 @@ mod tests { let a = _mm256_setr_epi64x(0, 1, 2, 3); let b = _mm256_setr_epi64x(3, 2, 2, 0); let r = _mm256_cmpeq_epi64(a, b); - assert_eq_m256i( - r, - _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2), - ); + assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2)); } #[simd_test = "avx2"] @@ -3662,33 +3718,42 @@ mod tests { let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0); let b = _mm256_set1_epi64x(0); let r = _mm256_cmpgt_epi64(a, b); - assert_eq_m256i( - r, - _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0), - ); + assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0)); } #[simd_test = "avx2"] unsafe fn test_mm256_cvtepi8_epi16() { - let a = - _mm_setr_epi8(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); - let r = - _mm256_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 0, -1, 1, -2, 2, -3, 3, + -4, 4, -5, 5, -6, 6, -7, 7, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let r = _mm256_setr_epi16( + 0, 0, -1, 1, -2, 2, -3, 3, + -4, 4, -5, 5, -6, 6, -7, 7, + ); assert_eq_m256i(r, _mm256_cvtepi8_epi16(a)); } #[simd_test = "avx2"] unsafe fn test_mm256_cvtepi8_epi32() { - let a = - _mm_setr_epi8(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 0, -1, 1, -2, 2, -3, 3, + -4, 4, -5, 5, -6, 6, -7, 7, + ); let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, 
-3, 3); assert_eq_m256i(r, _mm256_cvtepi8_epi32(a)); } #[simd_test = "avx2"] unsafe fn test_mm256_cvtepi8_epi64() { - let a = - _mm_setr_epi8(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 0, -1, 1, -2, 2, -3, 3, + -4, 4, -5, 5, -6, 6, -7, 7, + ); let r = _mm256_setr_epi64x(0, 0, -1, 1); assert_eq_m256i(r, _mm256_cvtepi8_epi64(a)); } @@ -3737,25 +3802,37 @@ mod tests { #[simd_test = "avx2"] unsafe fn test_mm256_cvtepu8_epi16() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = - _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let r = _mm256_setr_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); assert_eq_m256i(r, _mm256_cvtepu8_epi16(a)); } #[simd_test = "avx2"] unsafe fn test_mm256_cvtepu8_epi32() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); assert_eq_m256i(r, _mm256_cvtepu8_epi32(a)); } #[simd_test = "avx2"] unsafe fn test_mm256_cvtepu8_epi64() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); let r = _mm256_setr_epi64x(0, 1, 2, 3); assert_eq_m256i(r, _mm256_cvtepu8_epi64(a)); } @@ -3773,7 +3850,8 @@ mod tests { let a = _mm256_set1_epi16(2); let b = _mm256_set1_epi16(4); let r = _mm256_hadd_epi16(a, b); - let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + let e = + _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); assert_eq_m256i(r, e); } @@ -3793,8 +3871,11 @@ mod tests { let a = _mm256_insert_epi16(a, 1, 1); let b = _mm256_set1_epi16(4); let r = _mm256_hadds_epi16(a, b); - let e = - _mm256_setr_epi16(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = _mm256_setr_epi16( + 0x7FFF, 4, 4, 4, 8, 8, 8, 8, + 4, 4, 4, 4, 8, 8, 8, 8, + ); assert_eq_m256i(r, e); } @@ -4139,7 +4220,8 @@ mod tests { let a = _mm256_set1_epi32(2); let b = _mm256_set1_epi32(4); let r = _mm256_packs_epi32(a, b); - let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); + let e = + _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); assert_eq_m256i(r, e); } @@ -4165,7 +4247,8 @@ mod tests { let a = _mm256_set1_epi32(2); let b = _mm256_set1_epi32(4); let r = _mm256_packus_epi32(a, b); - let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); + let e = + _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); assert_eq_m256i(r, e); } @@ -4266,7 +4349,7 @@ mod tests { unsafe fn test_mm256_slli_epi16() { assert_eq_m256i( _mm256_slli_epi16(_mm256_set1_epi16(0xFF), 4), - _mm256_set1_epi16(0xFF0) + _mm256_set1_epi16(0xFF0), ); } @@ -4274,7 +4357,7 @@ mod tests { unsafe fn test_mm256_slli_epi32() { assert_eq_m256i( _mm256_slli_epi32(_mm256_set1_epi32(0xFFFF), 4), - _mm256_set1_epi32(0xFFFF0) + _mm256_set1_epi32(0xFFFF0), ); } @@ -4282,7 +4365,7 @@ mod tests { unsafe fn test_mm256_slli_epi64() { assert_eq_m256i( _mm256_slli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4), - 
_mm256_set1_epi64x(0xFFFFFFFF0) + _mm256_set1_epi64x(0xFFFFFFFF0), ); } @@ -4349,7 +4432,7 @@ mod tests { unsafe fn test_mm256_srai_epi16() { assert_eq_m256i( _mm256_srai_epi16(_mm256_set1_epi16(-1), 1), - _mm256_set1_epi16(-1) + _mm256_set1_epi16(-1), ); } @@ -4357,7 +4440,7 @@ mod tests { unsafe fn test_mm256_srai_epi32() { assert_eq_m256i( _mm256_srai_epi32(_mm256_set1_epi32(-1), 1), - _mm256_set1_epi32(-1) + _mm256_set1_epi32(-1), ); } @@ -4427,7 +4510,7 @@ mod tests { unsafe fn test_mm256_srli_epi16() { assert_eq_m256i( _mm256_srli_epi16(_mm256_set1_epi16(0xFF), 4), - _mm256_set1_epi16(0xF) + _mm256_set1_epi16(0xF), ); } @@ -4435,7 +4518,7 @@ mod tests { unsafe fn test_mm256_srli_epi32() { assert_eq_m256i( _mm256_srli_epi32(_mm256_set1_epi32(0xFFFF), 4), - _mm256_set1_epi32(0xFFF) + _mm256_set1_epi32(0xFFF), ); } @@ -4443,7 +4526,7 @@ mod tests { unsafe fn test_mm256_srli_epi64() { assert_eq_m256i( _mm256_srli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4), - _mm256_set1_epi64x(0xFFFFFFF) + _mm256_set1_epi64x(0xFFFFFFF), ); } @@ -4639,7 +4722,8 @@ mod tests { unsafe fn test_mm256_permutevar8x32_epi32() { let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800); let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4); - let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500); + let expected = + _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500); let r = _mm256_permutevar8x32_epi32(a, b); assert_eq_m256i(r, expected); } @@ -4739,7 +4823,10 @@ mod tests { _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0), 4, ); - assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256)); + assert_eq_m256i( + r, + _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256), + ); } #[simd_test = "avx2"] @@ -4789,7 +4876,10 @@ mod tests { _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4), 4, ); - assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0)); + assert_eq_m256( + r, + _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0), + ); } #[simd_test = "avx2"] @@ -4810,7 +4900,7 @@ mod tests { ); assert_eq_m256( r, - _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0) + _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0), ); } @@ -4821,11 +4911,8 @@ mod tests { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = _mm_i32gather_epi64( - arr.as_ptr(), - _mm_setr_epi32(0, 16, 0, 0), - 8, - ); + let r = + _mm_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8); assert_eq_m128i(r, _mm_setr_epi64x(0, 16)); } @@ -4887,8 +4974,7 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = - _mm_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8); + let r = _mm_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8); assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0)); } @@ -5220,8 +5306,11 @@ mod tests { #[simd_test = "avx2"] unsafe fn test_mm256_extract_epi16() { - let a = - _mm256_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm256_setr_epi16( + -1, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); let r1 = _mm256_extract_epi16(a, 0); let r2 = _mm256_extract_epi16(a, 19); assert_eq!(r1, -1); diff --git a/coresimd/src/x86/i586/rdtsc.rs b/coresimd/src/x86/i586/rdtsc.rs index 6c02a2d03b..a385dd9a01 100644 --- a/coresimd/src/x86/i586/rdtsc.rs +++ b/coresimd/src/x86/i586/rdtsc.rs @@ -1,3 +1,5 @@ +//! RDTSC instructions. 
+ #[cfg(test)] use stdsimd_test::assert_instr; @@ -22,7 +24,7 @@ pub unsafe fn _rdtsc() -> i64 { } /// Reads the current value of the processor’s time-stamp counter and -/// the IA32_TSC_AUX MSR. +/// the `IA32_TSC_AUX MSR`. /// /// The processor monotonically increments the time-stamp counter MSR /// every clock cycle and resets it to 0 whenever the processor is diff --git a/coresimd/src/x86/i586/sse.rs b/coresimd/src/x86/i586/sse.rs index 3960360588..5eb92b6a07 100644 --- a/coresimd/src/x86/i586/sse.rs +++ b/coresimd/src/x86/i586/sse.rs @@ -994,9 +994,9 @@ pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { _mm_load1_ps(p) } -/// Load four `f32` values from *aligned* memory into a `__m128`. If the pointer -/// is not aligned to a 128-bit boundary (16 bytes) a general protection fault -/// will be triggered (fatal program crash). +/// Load four `f32` values from *aligned* memory into a `__m128`. If the +/// pointer is not aligned to a 128-bit boundary (16 bytes) a general +/// protection fault will be triggered (fatal program crash). /// /// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned /// memory. @@ -1031,7 +1031,8 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { dst } -/// Load four `f32` values from aligned memory into a `__m128` in reverse order. +/// Load four `f32` values from aligned memory into a `__m128` in reverse +/// order. /// /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general /// protection fault will be triggered (fatal program crash). @@ -1307,8 +1308,8 @@ pub unsafe fn _mm_getcsr() -> u32 { /// check if an operation caused some overflow: /// /// ```rust,ignore -/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags -/// // perform calculations +/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags +/// // perform calculations /// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 { /// // handle overflow /// } @@ -1331,7 +1332,7 @@ pub unsafe fn _mm_getcsr() -> u32 { /// exception, use: /// /// ```rust,ignore -/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow +/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow /// exception /// ``` /// @@ -1374,8 +1375,8 @@ pub unsafe fn _mm_getcsr() -> u32 { /// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`: /// /// ```rust,ignore -/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default) -/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on +/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default) +/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on /// ``` /// #[inline] @@ -2357,14 +2358,9 @@ mod tests { let r = _mm_comieq_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2383,14 +2379,9 @@ mod tests { let r = _mm_comilt_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2409,14 +2400,9 @@ mod tests { let r = _mm_comile_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2435,14 +2421,9 @@ mod tests { let r = _mm_comige_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2461,14 +2442,9 @@ mod tests { let r = _mm_comineq_ss(a, 
b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2487,14 +2463,9 @@ mod tests { let r = _mm_ucomieq_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2513,14 +2484,9 @@ mod tests { let r = _mm_ucomilt_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2539,14 +2505,9 @@ mod tests { let r = _mm_ucomile_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2565,14 +2526,9 @@ mod tests { let r = _mm_ucomigt_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2591,14 +2547,9 @@ mod tests { let r = _mm_ucomige_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2617,14 +2568,9 @@ mod tests { let r = _mm_ucomineq_ss(a, b); assert_eq!( - ee[i], - r, + ee[i], r, "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r, - ee[i], - i + a, b, r, ee[i], i ); } } @@ -2652,24 +2598,14 @@ mod tests { let s2 = _MM_GET_EXCEPTION_STATE(); assert_eq!( - ee[i], - r1, + ee[i], r1, "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r1, - ee[i], - i + a, b, r1, ee[i], i ); assert_eq!( - ee[i], - r2, + ee[i], r2, "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})", - a, - b, - r2, - ee[i], - i + a, b, r2, ee[i], i ); assert_eq!( s1, @@ -2691,19 +2627,16 @@ mod tests { #[simd_test = "sse"] unsafe fn test_mm_cvtss_si32() { let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; - let result = &[42i32, -3, i32::min_value(), 0, i32::min_value(), 2147483520]; + let result = + &[42i32, -3, i32::min_value(), 0, i32::min_value(), 2147483520]; for i in 0..inputs.len() { let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); let e = result[i]; let r = _mm_cvtss_si32(x); assert_eq!( - e, - r, + e, r, "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}", - i, - x, - r, - e + i, x, r, e ); } } @@ -2727,13 +2660,9 @@ mod tests { let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); let r = _mm_cvttss_si32(x); assert_eq!( - e, - r, + e, r, "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}", - i, - x, - r, - e + i, x, r, e ); } } @@ -2904,7 +2833,8 @@ mod tests { } let r = _mm_load_ps(p); - let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); + let e = + _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); assert_eq_m128(r, e); } @@ -2934,7 +2864,8 @@ mod tests { } let r = _mm_loadr_ps(p); - let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); + let e = + _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); assert_eq_m128(r, e); } @@ -3146,8 +3077,7 @@ mod tests { let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0); assert_eq_m128(r, exp); - let underflow = - _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0; + let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0; assert_eq!(underflow, true); } diff --git a/coresimd/src/x86/i586/sse2.rs b/coresimd/src/x86/i586/sse2.rs index ab4a574e2d..da2906e115 100644 --- a/coresimd/src/x86/i586/sse2.rs +++ b/coresimd/src/x86/i586/sse2.rs 
@@ -1,6 +1,5 @@ //! Streaming SIMD Extensions 2 (SSE2) - #[cfg(test)] use stdsimd_test::assert_instr; @@ -878,7 +877,9 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maskmovdqu))] -pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) { +pub unsafe fn _mm_maskmoveu_si128( + a: __m128i, mask: __m128i, mem_addr: *mut i8 +) { maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr) } @@ -1195,7 +1196,11 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhwd))] pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { - let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]); + let x = simd_shuffle8( + a.as_i16x8(), + b.as_i16x8(), + [4, 12, 5, 13, 6, 14, 7, 15], + ); mem::transmute::(x) } @@ -1204,7 +1209,11 @@ pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhdq))] pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { - mem::transmute::(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) + mem::transmute::(simd_shuffle4( + a.as_i32x4(), + b.as_i32x4(), + [2, 6, 3, 7], + )) } /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. @@ -1212,7 +1221,11 @@ pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhqdq))] pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { - mem::transmute::(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [1, 3])) + mem::transmute::(simd_shuffle2( + a.as_i64x2(), + b.as_i64x2(), + [1, 3], + )) } /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. @@ -1232,7 +1245,8 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklwd))] pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { - let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]); + let x = + simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]); mem::transmute::(x) } @@ -1241,7 +1255,11 @@ pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckldq))] pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { - mem::transmute::(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) + mem::transmute::(simd_shuffle4( + a.as_i32x4(), + b.as_i32x4(), + [0, 4, 1, 5], + )) } /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. @@ -1249,7 +1267,11 @@ pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklqdq))] pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { - mem::transmute::(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [0, 2])) + mem::transmute::(simd_shuffle2( + a.as_i64x2(), + b.as_i64x2(), + [0, 2], + )) } /// Return a new vector with the low element of `a` replaced by the sum of the @@ -2332,7 +2354,7 @@ mod tests { use stdsimd_test::simd_test; use test::black_box; // Used to inhibit constant-folding. 
use x86::*; - use v128::*; + use v128::*; #[simd_test = "sse2"] unsafe fn test_mm_pause() { @@ -2357,8 +2379,24 @@ mod tests { #[simd_test = "sse2"] unsafe fn test_mm_add_epi8() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = _mm_setr_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm_setr_epi8( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, @@ -2408,8 +2446,24 @@ mod tests { #[simd_test = "sse2"] unsafe fn test_mm_adds_epi8() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = _mm_setr_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm_setr_epi8( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, @@ -2465,8 +2519,24 @@ mod tests { #[simd_test = "sse2"] unsafe fn test_mm_adds_epu8() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = _mm_setr_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm_setr_epi8( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, @@ -2715,8 +2785,24 @@ mod tests { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ); let r = _mm_slli_si128(a, 1); - let e = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e = _mm_setr_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + ); assert_eq_m128i(r, e); #[cfg_attr(rustfmt, rustfmt_skip)] @@ -2958,8 +3044,24 @@ mod tests { #[simd_test = "sse2"] unsafe fn test_mm_cmpeq_epi8() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let a = _mm_setr_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + ); let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = _mm_cmpeq_epi8(a, b); @@ -3104,8 +3206,11 @@ mod tests { let r = _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); - let e = - _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = _mm_setr_epi8( + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, + ); assert_eq_m128i(r, e); } @@ -3151,8 +3256,11 @@ mod tests { let r = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); - let e = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); assert_eq_m128i(r, e); } @@ -3186,7 +3294,11 @@ mod tests { #[simd_test = "sse2"] unsafe fn test_mm_maskmoveu_si128() { let a = _mm_set1_epi8(9); - let mask = _mm_set_epi8(0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + #[cfg_attr(rustfmt, rustfmt_skip)] + let mask = _mm_set_epi8( + 0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ); let mut r = _mm_set1_epi8(0); _mm_maskmoveu_si128(a, mask, &mut r as *mut _ as *mut i8); let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); @@ -3197,10 +3309,7 @@ mod tests { unsafe fn test_mm_store_si128() { let a = _mm_set1_epi8(9); let mut r = _mm_set1_epi8(0); - _mm_store_si128( - &mut r as *mut _ as *mut __m128i, - a, - ); + _mm_store_si128(&mut r as *mut _ as *mut __m128i, a); assert_eq_m128i(r, a); } @@ 
-3208,10 +3317,7 @@ mod tests { unsafe fn test_mm_storeu_si128() { let a = _mm_set1_epi8(9); let mut r = _mm_set1_epi8(0); - _mm_storeu_si128( - &mut r as *mut _ as *mut __m128i, - a, - ); + _mm_storeu_si128(&mut r as *mut _ as *mut __m128i, a); assert_eq_m128i(r, a); } @@ -3219,10 +3325,7 @@ mod tests { unsafe fn test_mm_storel_epi64() { let a = _mm_setr_epi64x(2, 9); let mut r = _mm_set1_epi8(0); - _mm_storel_epi64( - &mut r as *mut _ as *mut __m128i, - a, - ); + _mm_storel_epi64(&mut r as *mut _ as *mut __m128i, a); assert_eq_m128i(r, _mm_setr_epi64x(2, 0)); } @@ -3270,7 +3373,7 @@ mod tests { let r = _mm_packs_epi32(a, b); assert_eq_m128i( r, - _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF) + _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF), ); } @@ -3281,7 +3384,7 @@ mod tests { let r = _mm_packus_epi16(a, b); assert_eq_m128i( r, - _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0) + _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0), ); } @@ -3341,8 +3444,11 @@ mod tests { #[simd_test = "sse2"] unsafe fn test_mm_unpackhi_epi8() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm_setr_epi8( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, @@ -3384,15 +3490,21 @@ mod tests { #[simd_test = "sse2"] unsafe fn test_mm_unpacklo_epi8() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm_setr_epi8( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); let r = _mm_unpacklo_epi8(a, b); - let e = - _mm_setr_epi8(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + #[cfg_attr(rustfmt, rustfmt_skip)] + let e = _mm_setr_epi8( + 0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23, + ); assert_eq_m128i(r, e); } @@ -4043,10 +4155,12 @@ mod tests { assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0)); let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN)); - assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0)); + assert_eq_m128( + r, + _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0), + ); - let r = - _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64)); + let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64)); assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0)); } @@ -4075,10 +4189,7 @@ mod tests { let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN)); assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); - let r = _mm_cvtpd_epi32(_mm_setr_pd( - f64::INFINITY, - f64::NEG_INFINITY, - )); + let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY)); assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0)); let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN)); @@ -4118,8 +4229,8 @@ mod tests { f32::INFINITY, f32::NEG_INFINITY, f32::MAX, - f32::NEG_INFINITY - ) + f32::NEG_INFINITY, + ), ); } @@ -4175,7 +4286,10 @@ mod tests { let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX); let r = _mm_cvttps_epi32(a); - assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN)); + assert_eq_m128i( + r, + _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN), + ); } #[simd_test = "sse2"] diff --git 
a/coresimd/src/x86/i586/sse41.rs b/coresimd/src/x86/i586/sse41.rs index 4ca8397bd8..ab5e509d0d 100644 --- a/coresimd/src/x86/i586/sse41.rs +++ b/coresimd/src/x86/i586/sse41.rs @@ -50,7 +50,9 @@ pub const _MM_FROUND_NEARBYINT: i32 = #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pblendvb))] -pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { +pub unsafe fn _mm_blendv_epi8( + a: __m128i, b: __m128i, mask: __m128i +) -> __m128i { mem::transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16())) } @@ -539,13 +541,13 @@ pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { /// use coresimd::vendor; /// /// // round to nearest, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_NEAREST_INT | vendor::_MM_FROUND_NO_EXC); /// // round down, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_NEG_INF | vendor::_MM_FROUND_NO_EXC); /// // round up, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_POS_INF | vendor::_MM_FROUND_NO_EXC); /// // truncate, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_ZERO | vendor::_MM_FROUND_NO_EXC); /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` @@ -568,13 +570,13 @@ pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { /// use coresimd::vendor; /// /// // round to nearest, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_NEAREST_INT | vendor::_MM_FROUND_NO_EXC); /// // round down, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_NEG_INF | vendor::_MM_FROUND_NO_EXC); /// // round up, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_POS_INF | vendor::_MM_FROUND_NO_EXC); /// // truncate, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_ZERO | vendor::_MM_FROUND_NO_EXC); /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` @@ -599,13 +601,13 @@ pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { /// use coresimd::vendor; /// /// // round to nearest, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_NEAREST_INT | vendor::_MM_FROUND_NO_EXC); /// // round down, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_NEG_INF | vendor::_MM_FROUND_NO_EXC); /// // round up, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_POS_INF | vendor::_MM_FROUND_NO_EXC); /// // truncate, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_ZERO | vendor::_MM_FROUND_NO_EXC); /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` @@ -630,13 +632,13 @@ pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { /// use coresimd::vendor; /// /// // round to nearest, and suppress exceptions: -/// 
(vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_NEAREST_INT | vendor::_MM_FROUND_NO_EXC); /// // round down, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_NEG_INF | vendor::_MM_FROUND_NO_EXC); /// // round up, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_POS_INF | vendor::_MM_FROUND_NO_EXC); /// // truncate, and suppress exceptions: -/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); +/// (vendor::_MM_FROUND_TO_ZERO | vendor::_MM_FROUND_NO_EXC); /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` @@ -689,8 +691,8 @@ pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { /// 64-bit integers, and returns the lowest 32-bit, whatever they might be, /// reinterpreted as a signed integer. While `pmulld __m128i::splat(2), /// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping -/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would return a -/// negative number. +/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would +/// return a negative number. #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmulld))] @@ -803,14 +805,20 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn test_mm_blendv_epi8() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); #[cfg_attr(rustfmt, rustfmt_skip)] let b = _mm_setr_epi8( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); - let mask = - _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + #[cfg_attr(rustfmt, rustfmt_skip)] + let mask = _mm_setr_epi8( + 0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1, + ); #[cfg_attr(rustfmt, rustfmt_skip)] let e = _mm_setr_epi8( 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, @@ -876,8 +884,11 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn test_mm_extract_epi8() { - let a = - _mm_setr_epi8(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + -1, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 + ); let r1 = _mm_extract_epi8(a, 0); let r2 = _mm_extract_epi8(a, 19); assert_eq!(r1, 0xFF); @@ -1385,8 +1396,11 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn test_mm_mpsadbw_epu8() { - let a = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); let r = _mm_mpsadbw_epu8(a, a, 0b000); let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); diff --git a/coresimd/src/x86/i586/sse42.rs b/coresimd/src/x86/i586/sse42.rs index f850306d29..e53836efa2 100644 --- a/coresimd/src/x86/i586/sse42.rs +++ b/coresimd/src/x86/i586/sse42.rs @@ -627,7 +627,7 @@ mod tests { slice.get_unchecked_mut(0) as *mut u8 as *mut u8, s.len(), ); - _mm_loadu_si128(slice.as_ptr() as *const _) + _mm_loadu_si128(slice.as_ptr() as *const _) } #[simd_test = "sse4.2"] @@ -689,11 +689,7 @@ mod tests { ); let a = a_bytes; let b = b_bytes; - let i = _mm_cmpistro( - a, - b, - _SIDD_UWORD_OPS | _SIDD_UNIT_MASK, - ); + let i = _mm_cmpistro(a, b, _SIDD_UWORD_OPS | _SIDD_UNIT_MASK); assert_eq!(0, i); } @@ -722,8 +718,7 @@ mod 
tests { unsafe fn test_mm_cmpestri() { let a = str_to_m128i(b"bar - garbage"); let b = str_to_m128i(b"foobar"); - let i = - _mm_cmpestri(a, 3, b, 6, _SIDD_CMP_EQUAL_ORDERED); + let i = _mm_cmpestri(a, 3, b, 6, _SIDD_CMP_EQUAL_ORDERED); assert_eq!(3, i); } @@ -731,8 +726,7 @@ mod tests { unsafe fn test_mm_cmpestrz() { let a = str_to_m128i(b""); let b = str_to_m128i(b"Hello"); - let i = - _mm_cmpestrz(a, 16, b, 6, _SIDD_CMP_EQUAL_ORDERED); + let i = _mm_cmpestrz(a, 16, b, 6, _SIDD_CMP_EQUAL_ORDERED); assert_eq!(1, i); } @@ -769,13 +763,8 @@ mod tests { unsafe fn test_mm_cmpestra() { let a = str_to_m128i(b"Cannot match a"); let b = str_to_m128i(b"Null after 14"); - let i = _mm_cmpestra( - a, - 14, - b, - 16, - _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK, - ); + let i = + _mm_cmpestra(a, 14, b, 16, _SIDD_CMP_EQUAL_EACH | _SIDD_UNIT_MASK); assert_eq!(1, i); } diff --git a/coresimd/src/x86/i586/ssse3.rs b/coresimd/src/x86/i586/ssse3.rs index 01e461bd79..4efcee388d 100644 --- a/coresimd/src/x86/i586/ssse3.rs +++ b/coresimd/src/x86/i586/ssse3.rs @@ -313,10 +313,18 @@ mod tests { #[simd_test = "ssse3"] unsafe fn test_mm_shuffle_epi8() { - let a = - _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = - _mm_setr_epi8(4, 128u8 as i8, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let b = _mm_setr_epi8( + 4, 128_u8 as i8, 4, 3, + 24, 12, 6, 19, + 12, 5, 5, 10, + 4, 1, 8, 0, + ); let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1); let r = _mm_shuffle_epi8(a, b); @@ -325,24 +333,38 @@ mod tests { #[simd_test = "ssse3"] unsafe fn test_mm_alignr_epi8() { - let a = - _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = - _mm_setr_epi8(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let b = _mm_setr_epi8( + 4, 63, 4, 3, + 24, 12, 6, 19, + 12, 5, 5, 10, + 4, 1, 8, 0, + ); let r = _mm_alignr_epi8(a, b, 33); assert_eq_m128i(r, _mm_set1_epi8(0)); let r = _mm_alignr_epi8(a, b, 17); - let expected = - _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0); + #[cfg_attr(rustfmt, rustfmt_skip)] + let expected = _mm_setr_epi8( + 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 0, + ); assert_eq_m128i(r, expected); let r = _mm_alignr_epi8(a, b, 16); assert_eq_m128i(r, a); let r = _mm_alignr_epi8(a, b, 15); - let expected = - _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + #[cfg_attr(rustfmt, rustfmt_skip)] + let expected = _mm_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + ); assert_eq_m128i(r, expected); let r = _mm_alignr_epi8(a, b, 0); @@ -405,10 +427,18 @@ mod tests { #[simd_test = "ssse3"] unsafe fn test_mm_maddubs_epi16() { - let a = - _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b = - _mm_setr_epi8(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + ); + #[cfg_attr(rustfmt, rustfmt_skip)] + let b = _mm_setr_epi8( + 4, 63, 4, 3, + 24, 12, 6, 19, + 12, 5, 5, 10, + 4, 1, 8, 0, + ); let expected = _mm_setr_epi16(130, 24, 192, 194, 158, 175, 66, 120); let r = _mm_maddubs_epi16(a, b); 
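The `expected` vector above is easier to audit against a scalar model of `pmaddubsw`'s assumed semantics: unsigned bytes from `a` times signed bytes from `b`, with adjacent products summed under signed 16-bit saturation. This is an illustrative sketch, not the crate's implementation; the assertion it anticipates follows just below.

fn maddubs_epi16(a: [u8; 16], b: [i8; 16]) -> [i16; 8] {
    let mut out = [0i16; 8];
    for i in 0..8 {
        // Widen to i32 so the pairwise sum cannot overflow before clamping.
        let sum = i32::from(a[2 * i]) * i32::from(b[2 * i])
            + i32::from(a[2 * i + 1]) * i32::from(b[2 * i + 1]);
        out[i] = sum.max(i32::from(i16::MIN)).min(i32::from(i16::MAX)) as i16;
    }
    out
}

fn main() {
    let a: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
    let b: [i8; 16] = [4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0];
    // e.g. lane 0: 1 * 4 + 2 * 63 = 130, matching the test's expected value.
    assert_eq!(maddubs_epi16(a, b), [130, 24, 192, 194, 158, 175, 66, 120]);
}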
assert_eq_m128i(r, expected); diff --git a/coresimd/src/x86/i686/mmx.rs b/coresimd/src/x86/i686/mmx.rs index c5d69bcec0..7074f6a28b 100644 --- a/coresimd/src/x86/i686/mmx.rs +++ b/coresimd/src/x86/i686/mmx.rs @@ -514,7 +514,8 @@ mod tests { -30001, i16::max_value() - 1, ); - let e = _mm_setr_pi16(i16::min_value(), 30000, -30000, i16::max_value()); + let e = + _mm_setr_pi16(i16::min_value(), 30000, -30000, i16::max_value()); assert_eq_m64(e, _mm_add_pi16(a, b)); assert_eq_m64(e, _m_paddw(a, b)); } diff --git a/coresimd/src/x86/i686/sse.rs b/coresimd/src/x86/i686/sse.rs index 1cb830eff8..e77f1f3ed6 100644 --- a/coresimd/src/x86/i686/sse.rs +++ b/coresimd/src/x86/i686/sse.rs @@ -1,6 +1,5 @@ //! `i686` Streaming SIMD Extensions (SSE) -use core::mem; use x86::*; #[cfg(test)] @@ -204,7 +203,7 @@ pub unsafe fn _m_psadbw(a: __m64, b: __m64) -> __m64 { #[target_feature(enable = "sse,mmx")] #[cfg_attr(test, assert_instr(cvtpi2ps))] pub unsafe fn _mm_cvtpi32_ps(a: __m128, b: __m64) -> __m128 { - cvtpi2ps(a, mem::transmute(b)) + cvtpi2ps(a, b) } /// Converts two elements of a 64-bit vector of [2 x i32] into two @@ -315,7 +314,7 @@ pub unsafe fn _m_maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8) { #[cfg_attr(test, assert_instr(pextrw, imm2 = 0))] pub unsafe fn _mm_extract_pi16(a: __m64, imm2: i32) -> i16 { macro_rules! call { - ($imm2:expr) => { pextrw(mem::transmute(a), $imm2) as i16 } + ($imm2:expr) => { pextrw(a, $imm2) as i16 } } constify_imm2!(imm2, call) } @@ -359,7 +358,7 @@ pub unsafe fn _m_pinsrw(a: __m64, d: i32, imm2: i32) -> __m64 { #[target_feature(enable = "sse,mmx")] #[cfg_attr(test, assert_instr(pmovmskb))] pub unsafe fn _mm_movemask_pi8(a: __m64) -> i32 { - pmovmskb(mem::transmute(a)) + pmovmskb(a) } /// Takes the most significant bit from each 8-bit element in a 64-bit @@ -399,7 +398,7 @@ pub unsafe fn _m_pshufw(a: __m64, imm8: i32) -> __m64 { #[target_feature(enable = "sse,mmx")] #[cfg_attr(test, assert_instr(cvttps2pi))] pub unsafe fn _mm_cvttps_pi32(a: __m128) -> __m64 { - mem::transmute(cvttps2pi(a)) + cvttps2pi(a) } /// Convert the two lower packed single-precision (32-bit) floating-point @@ -534,8 +533,11 @@ mod tests { #[simd_test = "sse,mmx"] unsafe fn test_mm_sad_pu8() { - let a = _mm_setr_pi8(255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8, - 1, 2, 3, 4); + #[cfg_attr(rustfmt, rustfmt_skip)] + let a = _mm_setr_pi8( + 255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8, + 1, 2, 3, 4, + ); let b = _mm_setr_pi8(0, 0, 0, 0, 2, 1, 2, 1); let r = _mm_sad_pu8(a, b); assert_eq_m64(r, _mm_setr_pi16(1020, 0, 0, 0)); @@ -602,11 +604,7 @@ mod tests { let a = _mm_set1_pi8(9); let mask = _mm_setr_pi8(0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0); let mut r = _mm_set1_pi8(0); - _mm_maskmove_si64( - a, - mask, - &mut r as *mut _ as *mut i8, - ); + _mm_maskmove_si64(a, mask, &mut r as *mut _ as *mut i8); let e = _mm_setr_pi8(0, 0, 9, 0, 0, 0, 0, 0); assert_eq_m64(r, e); @@ -643,7 +641,8 @@ mod tests { #[simd_test = "sse,mmx"] unsafe fn test_mm_movemask_pi8() { - let a = _mm_setr_pi16(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000); + let a = + _mm_setr_pi16(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000); let r = _mm_movemask_pi8(a); assert_eq!(r, 0b10001); diff --git a/coresimd/src/x86/i686/sse2.rs b/coresimd/src/x86/i686/sse2.rs index ba34838c03..41cd8b3578 100644 --- a/coresimd/src/x86/i686/sse2.rs +++ b/coresimd/src/x86/i686/sse2.rs @@ -24,7 +24,7 @@ pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 { #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(pmuludq))] 
pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 { - pmuludq(mem::transmute(a), mem::transmute(b)) + pmuludq(a, b) } /// Subtracts signed or unsigned 64-bit integer values and writes the @@ -176,8 +176,7 @@ mod tests { #[simd_test = "sse2,mmx"] unsafe fn test_mm_set_epi64() { - let r = - _mm_set_epi64(mem::transmute(1i64), mem::transmute(2i64)); + let r = _mm_set_epi64(mem::transmute(1i64), mem::transmute(2i64)); assert_eq_m128i(r, _mm_setr_epi64x(2, 1)); } @@ -189,8 +188,7 @@ mod tests { #[simd_test = "sse2,mmx"] unsafe fn test_mm_setr_epi64() { - let r = - _mm_setr_epi64(mem::transmute(1i64), mem::transmute(2i64)); + let r = _mm_setr_epi64(mem::transmute(1i64), mem::transmute(2i64)); assert_eq_m128i(r, _mm_setr_epi64x(1, 2)); } @@ -202,16 +200,7 @@ mod tests { #[simd_test = "sse2,mmx"] unsafe fn test_mm_movpi64_epi64() { - let r = _mm_movpi64_epi64(_mm_setr_pi8( - 5, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - )); + let r = _mm_movpi64_epi64(_mm_setr_pi8(5, 0, 0, 0, 0, 0, 0, 0)); assert_eq_m128i(r, _mm_setr_epi64x(5, 0)); } diff --git a/coresimd/src/x86/i686/sse42.rs b/coresimd/src/x86/i686/sse42.rs index f092fe412f..ad788861da 100644 --- a/coresimd/src/x86/i686/sse42.rs +++ b/coresimd/src/x86/i686/sse42.rs @@ -27,6 +27,9 @@ mod tests { let a = _mm_setr_epi64x(0, 0x2a); let b = _mm_set1_epi64x(0x00); let i = _mm_cmpgt_epi64(a, b); - assert_eq_m128i(i, _mm_setr_epi64x(0x00, 0xffffffffffffffffu64 as i64)); + assert_eq_m128i( + i, + _mm_setr_epi64x(0x00, 0xffffffffffffffffu64 as i64), + ); } } diff --git a/coresimd/src/x86/mod.rs b/coresimd/src/x86/mod.rs index d1f42f6cc9..27404922ee 100644 --- a/coresimd/src/x86/mod.rs +++ b/coresimd/src/x86/mod.rs @@ -16,6 +16,7 @@ macro_rules! types { #[repr(simd)] pub struct $name($($fields)*); + #[cfg_attr(feature = "cargo-clippy", allow(expl_impl_clone_on_copy))] impl Clone for $name { #[inline] // currently needed for correctness fn clone(&self) -> $name { @@ -350,7 +351,9 @@ trait m128iExt: Sized { impl m128iExt for __m128i { #[inline] - fn as_m128i(self) -> __m128i { self } + fn as_m128i(self) -> Self { + self + } } #[doc(hidden)] @@ -401,7 +404,9 @@ trait m256iExt: Sized { impl m256iExt for __m256i { #[inline] - fn as_m256i(self) -> __m256i { self } + fn as_m256i(self) -> Self { + self + } } mod i386; diff --git a/coresimd/src/x86/test.rs b/coresimd/src/x86/test.rs index 51763380a9..1ebc8d7406 100644 --- a/coresimd/src/x86/test.rs +++ b/coresimd/src/x86/test.rs @@ -4,13 +4,19 @@ use x86::*; #[target_feature(enable = "mmx")] pub unsafe fn assert_eq_m64(a: __m64, b: __m64) { - union A { a: __m64, b: u64 } + union A { + a: __m64, + b: u64, + } assert_eq!(A { a }.b, A { a: b }.b) } #[target_feature(enable = "sse2")] pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { - union A { a: __m128i, b: [u64; 2] } + union A { + a: __m128i, + b: [u64; 2], + } assert_eq!(A { a }.b, A { a: b }.b) } @@ -23,7 +29,10 @@ pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { #[target_feature(enable = "sse2")] pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 { - union A { a: __m128d, b: [f64; 2] }; + union A { + a: __m128d, + b: [f64; 2], + }; A { a }.b[idx] } @@ -37,7 +46,10 @@ pub unsafe fn assert_eq_m128(a: __m128, b: __m128) { #[target_feature(enable = "sse")] pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 { - union A { a: __m128, b: [f32; 4] }; + union A { + a: __m128, + b: [f32; 4], + }; A { a }.b[idx] } @@ -50,7 +62,10 @@ pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { #[target_feature(enable = "avx")] pub unsafe fn assert_eq_m256i(a: 
__m256i, b: __m256i) { - union A { a: __m256i, b: [u64; 4] } + union A { + a: __m256i, + b: [u64; 4], + } assert_eq!(A { a }.b, A { a: b }.b) } @@ -64,7 +79,10 @@ pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) { #[target_feature(enable = "avx")] pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 { - union A { a: __m256d, b: [f64; 4] }; + union A { + a: __m256d, + b: [f64; 4], + }; A { a }.b[idx] } @@ -78,26 +96,37 @@ pub unsafe fn assert_eq_m256(a: __m256, b: __m256) { #[target_feature(enable = "avx")] pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 { - union A { a: __m256, b: [f32; 8] }; + union A { + a: __m256, + b: [f32; 8], + }; A { a }.b[idx] } -// These intrinsics doesn't exist on x86 b/c it requires a 64-bit registe,r which -// doesn't exist on x86! +// These intrinsics don't exist on x86 b/c they require a 64-bit register, +// which doesn't exist on x86! #[cfg(target_arch = "x86")] mod x86_polyfill { use x86::*; pub unsafe fn _mm_insert_epi64(a: __m128i, val: i64, idx: i32) -> __m128i { - union A { a: __m128i, b: [i64; 2] }; + union A { + a: __m128i, + b: [i64; 2], + }; let mut a = A { a }; a.b[idx as usize] = val; a.a } #[target_feature(enable = "avx2")] - pub unsafe fn _mm256_insert_epi64(a: __m256i, val: i64, idx: i32) -> __m256i { - union A { a: __m256i, b: [i64; 4] }; + pub unsafe fn _mm256_insert_epi64( + a: __m256i, val: i64, idx: i32 + ) -> __m256i { + union A { + a: __m256i, + b: [i64; 4], + }; let mut a = A { a }; a.b[idx as usize] = val; a.a diff --git a/coresimd/src/x86/x86_64/abm.rs b/coresimd/src/x86/x86_64/abm.rs index 235fa8bb17..510ba41c16 100644 --- a/coresimd/src/x86/x86_64/abm.rs +++ b/coresimd/src/x86/x86_64/abm.rs @@ -1,3 +1,22 @@ +//! Advanced Bit Manipulation (ABM) instructions +//! +//! The POPCNT and LZCNT instructions have their own CPUID bits to indicate support. +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wikipedia_bmi]: +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + #[cfg(test)] use stdsimd_test::assert_instr; diff --git a/coresimd/src/x86/x86_64/avx.rs b/coresimd/src/x86/x86_64/avx.rs index 3f9fda1451..33338bfb26 100644 --- a/coresimd/src/x86/x86_64/avx.rs +++ b/coresimd/src/x86/x86_64/avx.rs @@ -1,3 +1,18 @@ +//! Advanced Vector Extensions (AVX) +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose +//! and System Instructions][amd64_ref]. +//! +//! [Wikipedia][wiki] provides a quick overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! 
[wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions + use core::mem; use simd_llvm::*; diff --git a/coresimd/src/x86/x86_64/avx2.rs b/coresimd/src/x86/x86_64/avx2.rs index 6cdb542bfb..4786ef4d5d 100644 --- a/coresimd/src/x86/x86_64/avx2.rs +++ b/coresimd/src/x86/x86_64/avx2.rs @@ -1,3 +1,23 @@ +//! Advanced Vector Extensions 2 (AVX2) +//! +//! AVX2 expands most AVX commands to 256-bit wide vector registers and +//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate). +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick +//! overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate + use simd_llvm::*; use x86::*; diff --git a/coresimd/src/x86/x86_64/bmi.rs b/coresimd/src/x86/x86_64/bmi.rs index d8d8a74972..a048851424 100644 --- a/coresimd/src/x86/x86_64/bmi.rs +++ b/coresimd/src/x86/x86_64/bmi.rs @@ -1,3 +1,14 @@ +//! Bit Manipulation Instruction (BMI) Set 1.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + #[cfg(test)] use stdsimd_test::assert_instr; diff --git a/coresimd/src/x86/x86_64/bmi2.rs b/coresimd/src/x86/x86_64/bmi2.rs index b1c74d15c8..aa92133f08 100644 --- a/coresimd/src/x86/x86_64/bmi2.rs +++ b/coresimd/src/x86/x86_64/bmi2.rs @@ -1,3 +1,15 @@ +//! Bit Manipulation Instruction (BMI) Set 2.0. +//! +//! The reference is [Intel 64 and IA-32 Architectures Software Developer's +//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. +//! +//! [Wikipedia][wikipedia_bmi] provides a quick overview of the instructions +//! available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [wikipedia_bmi]: +//! 
https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 + #[cfg(test)] use stdsimd_test::assert_instr; diff --git a/coresimd/src/x86/x86_64/sse.rs b/coresimd/src/x86/x86_64/sse.rs index 4763e81b6f..ece2220daf 100644 --- a/coresimd/src/x86/x86_64/sse.rs +++ b/coresimd/src/x86/x86_64/sse.rs @@ -86,13 +86,9 @@ mod tests { let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); let r = _mm_cvtss_si64(x); assert_eq!( - e, - r, + e, r, "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}", - i, - x, - r, - e + i, x, r, e ); } } @@ -118,13 +114,9 @@ mod tests { let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); let r = _mm_cvttss_si64(x); assert_eq!( - e, - r, + e, r, "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}", - i, - x, - r, - e + i, x, r, e ); } } diff --git a/coresimd/src/x86/x86_64/sse2.rs b/coresimd/src/x86/x86_64/sse2.rs index ff16d1f957..4136da5677 100644 --- a/coresimd/src/x86/x86_64/sse2.rs +++ b/coresimd/src/x86/x86_64/sse2.rs @@ -120,7 +120,6 @@ mod tests { #[simd_test = "sse2"] unsafe fn test_mm_cvtsd_si64() { - let r = _mm_cvtsd_si64(_mm_setr_pd(-2.0, 5.0)); assert_eq!(r, -2_i64); diff --git a/examples/hex.rs b/examples/hex.rs index 630c6105bd..6da9a6c5e3 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -14,6 +14,10 @@ #![feature(cfg_target_feature, target_feature)] #![cfg_attr(test, feature(test))] +#![cfg_attr(feature = "cargo-clippy", + allow(result_unwrap_used, option_unwrap_used, print_stdout, + missing_docs_in_private_items, shadow_reuse, + cast_possible_wrap, cast_sign_loss))] #[macro_use] extern crate stdsimd; @@ -38,16 +42,16 @@ fn main() { fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { let len = src.len().checked_mul(2).unwrap(); if dst.len() < len { - return Err(len) + return Err(len); } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { if cfg_feature_enabled!("avx2") { - return unsafe { hex_encode_avx2(src, dst) } + return unsafe { hex_encode_avx2(src, dst) }; } if cfg_feature_enabled!("sse4.1") { - return unsafe { hex_encode_sse41(src, dst) } + return unsafe { hex_encode_sse41(src, dst) }; } } @@ -56,15 +60,15 @@ fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { #[target_feature(enable = "avx2")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -unsafe fn hex_encode_avx2<'a>(mut src: &[u8], dst: &'a mut [u8]) - -> Result<&'a str, usize> -{ +unsafe fn hex_encode_avx2<'a>( + mut src: &[u8], dst: &'a mut [u8] +) -> Result<&'a str, usize> { let ascii_zero = _mm256_set1_epi8(b'0' as i8); let nines = _mm256_set1_epi8(9); let ascii_a = _mm256_set1_epi8((b'a' - 9 - 1) as i8); let and4bits = _mm256_set1_epi8(0xf); - let mut i = 0isize; + let mut i = 0_isize; while src.len() >= 32 { let invec = _mm256_loadu_si256(src.as_ptr() as *const _); @@ -76,8 +80,14 @@ unsafe fn hex_encode_avx2<'a>(mut src: &[u8], dst: &'a mut [u8]) let cmpmask2 = _mm256_cmpgt_epi8(masked2, nines); // add '0' or the offset depending on the masks - let masked1 = _mm256_add_epi8(masked1, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask1)); - let masked2 = _mm256_add_epi8(masked2, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask2)); + let masked1 = _mm256_add_epi8( + masked1, + _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask1), + ); + let masked2 = _mm256_add_epi8( + masked2, + _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask2), + ); // interleave masked1 and masked2 bytes let res1 = _mm256_unpacklo_epi8(masked2, masked1); @@ -96,23 +106,23 @@ unsafe fn hex_encode_avx2<'a>(mut src: &[u8], dst: &'a mut 
[u8]) } let i = i as usize; - drop(hex_encode_sse41(src, &mut dst[i * 2..])); + let _ = hex_encode_sse41(src, &mut dst[i * 2..]); - return Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) + Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } // copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp #[target_feature(enable = "sse4.1")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) - -> Result<&'a str, usize> -{ +unsafe fn hex_encode_sse41<'a>( + mut src: &[u8], dst: &'a mut [u8] +) -> Result<&'a str, usize> { let ascii_zero = _mm_set1_epi8(b'0' as i8); let nines = _mm_set1_epi8(9); let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8); let and4bits = _mm_set1_epi8(0xf); - let mut i = 0isize; + let mut i = 0_isize; while src.len() >= 16 { let invec = _mm_loadu_si128(src.as_ptr() as *const _); @@ -124,8 +134,14 @@ unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) let cmpmask2 = _mm_cmpgt_epi8(masked2, nines); // add '0' or the offset depending on the masks - let masked1 = _mm_add_epi8(masked1, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1)); - let masked2 = _mm_add_epi8(masked2, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2)); + let masked1 = _mm_add_epi8( + masked1, + _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1), + ); + let masked2 = _mm_add_epi8( + masked2, + _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2), + ); // interleave masked1 and masked2 bytes let res1 = _mm_unpacklo_epi8(masked2, masked1); @@ -138,25 +154,25 @@ unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) } let i = i as usize; - drop(hex_encode_fallback(src, &mut dst[i * 2..])); + let _ = hex_encode_fallback(src, &mut dst[i * 2..]); - return Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) + Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } -fn hex_encode_fallback<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { - for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) { - slots[0] = hex((*byte >> 4) & 0xf); - slots[1] = hex((*byte >> 0) & 0xf); - } - - unsafe { - return Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) - } - +fn hex_encode_fallback<'a>( + src: &[u8], dst: &'a mut [u8] +) -> Result<&'a str, usize> { fn hex(byte: u8) -> u8 { static TABLE: &[u8] = b"0123456789abcdef"; TABLE[byte as usize] } + + for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) { + slots[0] = hex((*byte >> 4) & 0xf); + slots[1] = hex(*byte & 0xf); + } + + unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) } } // Run these with `cargo +nightly test --example hex` @@ -175,10 +191,16 @@ mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe { if cfg_feature_enabled!("avx2") { - assert_eq!(hex_encode_avx2(input, &mut tmp()).unwrap(), output); + assert_eq!( + hex_encode_avx2(input, &mut tmp()).unwrap(), + output + ); } if cfg_feature_enabled!("sse4.1") { - assert_eq!(hex_encode_sse41(input, &mut tmp()).unwrap(), output); + assert_eq!( + hex_encode_sse41(input, &mut tmp()).unwrap(), + output + ); } } } @@ -190,12 +212,18 @@ mod tests { #[test] fn big() { - test(&[0; 1024], &iter::repeat('0').take(2048).collect::<String>()); + test( + &[0; 1024], + &iter::repeat('0').take(2048).collect::<String>(), + ); } #[test] fn odd() { - test(&[0; 313], &iter::repeat('0').take(313 * 2).collect::<String>()); + test( + &[0; 313], + &iter::repeat('0').take(313 * 2).collect::<String>(), + ); } #[test] @@ -206,13 +234,16 @@ mod tests { input[17] = 0x30; 
input[21] = 1; input[31] = 0x24; - test(&input, "\ - 0000000003000000\ - 0000000000000000\ - 0330000000010000\ - 0000000000000024\ - 00\ - "); + test( + &input, + "\ + 0000000003000000\ + 0000000000000000\ + 0330000000010000\ + 0000000000000024\ + 00\ + ", + ); } quickcheck! { @@ -253,8 +284,8 @@ mod tests { // Run these with `cargo +nightly bench --example hex` #[cfg(test)] mod benches { - extern crate test; extern crate rand; + extern crate rand; + extern crate test; use self::rand::Rng; @@ -263,10 +294,10 @@ mod benches { const SMALL_LEN: usize = 117; const LARGE_LEN: usize = 1 * 1024 * 1024; - fn doit(b: &mut test::Bencher, - len: usize, - f: for<'a> unsafe fn(&[u8], &'a mut [u8]) -> Result<&'a str, usize>) - { + fn doit( + b: &mut test::Bencher, len: usize, + f: for<'a> unsafe fn(&[u8], &'a mut [u8]) -> Result<&'a str, usize>, + ) { let input = rand::thread_rng() .gen_iter::<u8>() .take(len) diff --git a/src/lib.rs b/src/lib.rs index e610631265..b2c521cb41 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -85,7 +85,7 @@ //! unsafe { sum_sse2(x) } //! } //! #[cfg(not(all(any(target_arch = "x86_64", target_arch = "x86"), -//! target_feature = "sse2")))] +//! target_feature = "sse2")))] //! { //! sum_portable(x) //! } diff --git a/src/runtime/linux/mod.rs b/src/runtime/linux/mod.rs index 9ff760b12d..fe685f9916 100644 --- a/src/runtime/linux/mod.rs +++ b/src/runtime/linux/mod.rs @@ -17,7 +17,7 @@ mod auxv; mod cpuinfo; /// Detects CPU features: -pub fn detect_features() -> usize { +pub fn detect_features() -> cache::Initializer { // Try to read the ELF Auxiliary Vector using libc's getauxval: if let Ok(v) = auxv::libc::auxv() { return arch::detect_features(v); @@ -31,7 +31,7 @@ pub fn detect_features() -> usize { return arch::detect_features(v); } // Otherwise all features are disabled - 0 + cache::Initializer::default() } /// Performs run-time feature detection. 
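The `detect_features` hunk above now returns a `cache::Initializer` and works through a chain of probes, falling back to an empty feature set. A self-contained sketch of that control flow, with hypothetical stand-in probes in place of the crate's `auxv`/`cpuinfo` readers and a stand-in `Initializer`:

// Stand-in for `cache::Initializer`: a 64-bit feature bitset.
#[derive(Default, Debug, PartialEq)]
struct Initializer(u64);

// Hypothetical probes; the real ones parse the ELF auxiliary vector
// (via libc's getauxval, then /proc/self/auxv) and finally /proc/cpuinfo.
fn auxv_via_libc() -> Result<u64, ()> { Err(()) }
fn auxv_via_proc() -> Result<u64, ()> { Err(()) }
fn proc_cpuinfo() -> Result<u64, ()> { Ok(0b10) }

fn detect_features() -> Initializer {
    if let Ok(v) = auxv_via_libc() { return Initializer(v); }
    if let Ok(v) = auxv_via_proc() { return Initializer(v); }
    if let Ok(v) = proc_cpuinfo() { return Initializer(v); }
    // Otherwise all features are disabled.
    Initializer::default()
}

fn main() {
    // The first successful probe wins; here that is the cpuinfo stand-in.
    assert_eq!(detect_features(), Initializer(0b10));
}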
diff --git a/stdsimd-test/assert-instr-macro/src/lib.rs b/stdsimd-test/assert-instr-macro/src/lib.rs index c9e2850944..545449c9a7 100644 --- a/stdsimd-test/assert-instr-macro/src/lib.rs +++ b/stdsimd-test/assert-instr-macro/src/lib.rs @@ -10,8 +10,8 @@ #![feature(proc_macro)] -extern crate proc_macro2; extern crate proc_macro; +extern crate proc_macro; +extern crate proc_macro2; #[macro_use] extern crate quote; #[macro_use] diff --git a/stdsimd-test/simd-test-macro/src/lib.rs b/stdsimd-test/simd-test-macro/src/lib.rs index 6f9ddc474d..1044d44858 100644 --- a/stdsimd-test/simd-test-macro/src/lib.rs +++ b/stdsimd-test/simd-test-macro/src/lib.rs @@ -5,8 +5,8 @@ #![feature(proc_macro)] -extern crate proc_macro2; extern crate proc_macro; +extern crate proc_macro; +extern crate proc_macro2; #[macro_use] extern crate quote; diff --git a/stdsimd-test/src/lib.rs b/stdsimd-test/src/lib.rs index 1ada9193e7..60c9e3e073 100644 --- a/stdsimd-test/src/lib.rs +++ b/stdsimd-test/src/lib.rs @@ -279,7 +279,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { let function = &functions[0]; let mut instrs = &function.instrs[..]; - while instrs.last().map(|s| s.parts == ["nop"]).unwrap_or(false) { + while instrs.last().map_or(false, |s| s.parts == ["nop"]) { instrs = &instrs[..instrs.len() - 1]; } @@ -346,8 +346,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { _ => 20, }; - let probably_only_one_instruction = - instrs.len() < instruction_limit; + let probably_only_one_instruction = instrs.len() < instruction_limit; if found && probably_only_one_instruction && !inlining_failed { return; diff --git a/stdsimd-verify/src/lib.rs b/stdsimd-verify/src/lib.rs index 2682a55adc..5e55e9d296 100644 --- a/stdsimd-verify/src/lib.rs +++ b/stdsimd-verify/src/lib.rs @@ -1,7 +1,7 @@ #![feature(proc_macro)] -extern crate proc_macro2; extern crate proc_macro; +extern crate proc_macro; +extern crate proc_macro2; #[macro_use] extern crate quote; extern crate syn; @@ -158,7 +158,7 @@ fn walk(root: &Path, files: &mut Vec<(syn::File, String)>) { } if path.file_name().and_then(|s| s.to_str()) == Some("test.rs") { - continue + continue; } let mut contents = String::new(); @@ -220,23 +220,19 @@ fn find_target_feature(attrs: &[syn::Attribute]) -> Option<syn::Lit> { _ => None, }) .flat_map(|list| list) - .filter_map(|nested| { - match nested { - syn::NestedMeta::Meta(m) => Some(m), - syn::NestedMeta::Literal(_) => None, - } + .filter_map(|nested| match nested { + syn::NestedMeta::Meta(m) => Some(m), + syn::NestedMeta::Literal(_) => None, }) - .filter_map(|m| { - match m { - syn::Meta::NameValue(i) => { - if i.ident == "enable" { - Some(i.lit) - } else { - None - } + .filter_map(|m| match m { + syn::Meta::NameValue(i) => { + if i.ident == "enable" { + Some(i.lit) + } else { + None } - _ => None, } + _ => None, }) .next() } diff --git a/stdsimd-verify/tests/x86-intel.rs b/stdsimd-verify/tests/x86-intel.rs index 41d66c07c6..0f49ee9854 100644 --- a/stdsimd-verify/tests/x86-intel.rs +++ b/stdsimd-verify/tests/x86-intel.rs @@ -1,14 +1,16 @@ #![feature(proc_macro)] #![allow(bad_style)] #![cfg_attr(feature = "cargo-clippy", - allow(shadow_reuse, cast_lossless, match_same_arms))] + allow(shadow_reuse, cast_lossless, match_same_arms, + nonminimal_bool, print_stdout, use_debug, eq_op, + useless_format))] #[macro_use] extern crate serde_derive; extern crate serde_xml_rs; extern crate stdsimd_verify; -use std::collections::{HashMap, BTreeMap}; +use std::collections::{BTreeMap, HashMap}; use stdsimd_verify::x86_functions; @@ -70,21 +72,26 @@ x86_functions!(static FUNCTIONS); #[derive(Deserialize)] struct Data { - 
#[serde(rename = "intrinsic", default)] intrinsics: Vec<Intrinsic>, + #[serde(rename = "intrinsic", default)] + intrinsics: Vec<Intrinsic>, } #[derive(Deserialize)] struct Intrinsic { rettype: String, name: String, - #[serde(rename = "CPUID", default)] cpuid: Vec<String>, - #[serde(rename = "parameter", default)] parameters: Vec<Parameter>, - #[serde(default)] instruction: Vec<Instruction>, + #[serde(rename = "CPUID", default)] + cpuid: Vec<String>, + #[serde(rename = "parameter", default)] + parameters: Vec<Parameter>, + #[serde(default)] + instruction: Vec<Instruction>, } #[derive(Deserialize)] struct Parameter { - #[serde(rename = "type")] type_: String, + #[serde(rename = "type")] + type_: String, } #[derive(Deserialize, Debug)] @@ -113,22 +120,21 @@ fn verify_all_signatures() { serde_xml_rs::deserialize(xml).expect("failed to deserialize xml"); let mut map = HashMap::new(); for intrinsic in &data.intrinsics { - map.entry(&intrinsic.name[..]).or_insert(Vec::new()).push(intrinsic); + map.entry(&intrinsic.name[..]) + .or_insert_with(Vec::new) + .push(intrinsic); } let mut all_valid = true; - 'outer: - for rust in FUNCTIONS { + 'outer: for rust in FUNCTIONS { match rust.name { - // These aren't defined by Intel but they're defined by what appears - // to be all other compilers. For more information see - // rust-lang-nursery/stdsimd#307, and otherwise these signatures - // have all been manually verified. - "__readeflags" | - "__writeeflags" | - "__cpuid_count" | - "__cpuid" | - "__get_cpuid_max" => continue, + // These aren't defined by Intel but they're defined by what + // appears to be all other compilers. For more + // information see rust-lang-nursery/stdsimd#307, and + // otherwise these signatures have all been manually + // verified. + "__readeflags" | "__writeeflags" | "__cpuid_count" | "__cpuid" | "__get_cpuid_max" => continue, _ => {} } @@ -147,7 +153,7 @@ fn verify_all_signatures() { let mut errors = Vec::new(); for intel in intel { - match matches(rust, &intel) { + match matches(rust, intel) { Ok(()) => continue 'outer, Err(e) => errors.push(e), } @@ -161,11 +167,11 @@ fn verify_all_signatures() { assert!(all_valid); let mut missing = BTreeMap::new(); - for (name, intel) in map.iter() { + for (name, intel) in &map { // currently focused mainly on missing SIMD intrinsics, but there's // definitely some other assorted ones that we're missing. if !name.starts_with("_mm") { - continue + continue; } // we'll get to avx-512 later @@ -179,8 +185,9 @@ fn verify_all_signatures() { // } for intel in intel { - missing.entry(&intel.cpuid) - .or_insert(Vec::new()) + missing + .entry(&intel.cpuid) + .or_insert_with(Vec::new) .push(intel); } } @@ -191,8 +198,11 @@ fn verify_all_signatures() { if PRINT_MISSING_LISTS_MARKDOWN { println!("\n
<details><summary>{:?}</summary>
\n", k); for intel in v { - let url = format!("https://software.intel.com/sites/landingpage\ - /IntrinsicsGuide/#text={}&expand=5236", intel.name); + let url = format!( + "https://software.intel.com/sites/landingpage\ + /IntrinsicsGuide/#text={}&expand=5236", + intel.name + ); println!(" * [ ] [`{}`]({})", intel.name, url); } println!("</details>
\n"); @@ -251,7 +261,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { let rust_feature = rust.target_feature .expect(&format!("no target feature listed for {}", rust.name)); if rust_feature.contains(&cpuid) { - continue + continue; } bail!( "intel cpuid `{}` not in `{}` for {}", @@ -263,9 +273,11 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { if PRINT_INSTRUCTION_VIOLATIONS { if rust.instrs.is_empty() { - if intel.instruction.len() > 0 { - println!("instruction not listed for `{}`, but intel lists {:?}", - rust.name, intel.instruction); + if !intel.instruction.is_empty() { + println!( + "instruction not listed for `{}`, but intel lists {:?}", + rust.name, intel.instruction + ); } // If intel doesn't list any instructions and we do then don't @@ -280,8 +292,7 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { if !asserting { println!( "intel failed to list `{}` as an instruction for `{}`", - instr, - rust.name + instr, rust.name ); } } @@ -315,51 +326,37 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { } } - let any_i64 = rust.arguments.iter() - .cloned() - .chain(rust.ret) - .any(|arg| { - match *arg { - Type::PrimSigned(64) | - Type::PrimUnsigned(64) => true, - _ => false, - } - }); + let any_i64 = rust.arguments.iter().cloned().chain(rust.ret).any(|arg| { + match *arg { + Type::PrimSigned(64) | Type::PrimUnsigned(64) => true, + _ => false, + } + }); let any_i64_exempt = match rust.name { // These intrinsics have all been manually verified against Clang's // headers to be available on x86, and the u64 arguments seem // spurious I guess? - "_xsave" | - "_xrstor" | - "_xsetbv" | - "_xgetbv" | - "_xsaveopt" | - "_xsavec" | - "_xsaves" | - "_xrstors" => true, + "_xsave" | "_xrstor" | "_xsetbv" | "_xgetbv" | "_xsaveopt" + | "_xsavec" | "_xsaves" | "_xrstors" => true, // Apparently all of clang/msvc/gcc accept these intrinsics on // 32-bit, so let's do the same - "_mm_set_epi64x" | - "_mm_set1_epi64x" | - "_mm256_set_epi64x" | - "_mm256_setr_epi64x" | - "_mm256_set1_epi64x" => true, + "_mm_set_epi64x" | "_mm_set1_epi64x" | "_mm256_set_epi64x" + | "_mm256_setr_epi64x" | "_mm256_set1_epi64x" => true, // These return a 64-bit argument but they're assembled from other // 32-bit registers, so these work on 32-bit just fine. See #308 for // more info. 
- "_rdtsc" | - "__rdtscp" => true, + "_rdtsc" | "__rdtscp" => true, _ => false, }; - if any_i64 && !any_i64_exempt { - if !rust.file.contains("x86_64") { - bail!("intrinsic `{}` uses a 64-bit bare type but may be \ - available on 32-bit platforms", - rust.name) - } + if any_i64 && !any_i64_exempt && !rust.file.contains("x86_64") { + bail!( + "intrinsic `{}` uses a 64-bit bare type but may be \ + available on 32-bit platforms", + rust.name + ) } Ok(()) } @@ -394,8 +391,7 @@ fn equate(t: &Type, intel: &str, intrinsic: &str) -> Result<(), String> { (&Type::Ptr(&Type::PrimUnsigned(8)), "const void*") => {} (&Type::Ptr(&Type::PrimUnsigned(8)), "void*") => {} - (&Type::M64, "__m64") - | (&Type::Ptr(&Type::M64), "__m64*") => {} + (&Type::M64, "__m64") | (&Type::Ptr(&Type::M64), "__m64*") => {} (&Type::M128I, "__m128i") | (&Type::Ptr(&Type::M128I), "__m128i*") @@ -435,12 +431,12 @@ fn equate(t: &Type, intel: &str, intrinsic: &str) -> Result<(), String> { if intrinsic.starts_with("_mm_ucomi") && intrinsic.ends_with("_sd") => {} - _ => { - bail!( - "failed to equate: `{}` and {:?} for {}", - intel, t, intrinsic - ) - } + _ => bail!( + "failed to equate: `{}` and {:?} for {}", + intel, + t, + intrinsic + ), } Ok(()) } diff --git a/tests/cpu-detection.rs b/tests/cpu-detection.rs index 3efb61ab74..3af2a320d9 100644 --- a/tests/cpu-detection.rs +++ b/tests/cpu-detection.rs @@ -1,6 +1,7 @@ #![feature(cfg_target_feature)] #![cfg_attr(feature = "strict", deny(warnings))] -#![cfg_attr(feature = "cargo-clippy", allow(option_unwrap_used))] +#![cfg_attr(feature = "cargo-clippy", + allow(option_unwrap_used, use_debug, print_stdout))] #[cfg(any(target_arch = "arm", target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64",