core/stdarch/crates/core_arch/src/x86/avx.rs

1//! Advanced Vector Extensions (AVX)
2//!
3//! The references are:
4//!
5//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
6//!   Instruction Set Reference, A-Z][intel64_ref].
7//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
8//!   System Instructions][amd64_ref].
9//!
10//! [Wikipedia][wiki] provides a quick overview of the instructions available.
11//!
12//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
13//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
14//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
15
16use crate::{
17    core_arch::{simd::*, x86::*},
18    intrinsics::simd::*,
19    mem, ptr,
20};
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
25/// Adds packed double-precision (64-bit) floating-point elements
26/// in `a` and `b`.
27///
28/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
29#[inline]
30#[target_feature(enable = "avx")]
31#[cfg_attr(test, assert_instr(vaddpd))]
32#[stable(feature = "simd_x86", since = "1.27.0")]
33#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
34pub const fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
35    unsafe { simd_add(a, b) }
36}
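
// Editorial usage sketch (not part of the upstream source): shows the element-wise
// semantics of `_mm256_add_pd`. The function name is illustrative only and the
// example assumes it runs on an AVX-capable CPU (e.g. guarded at runtime with
// `is_x86_feature_detected!("avx")`).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_add_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    let sum: [f64; 4] = unsafe { mem::transmute(_mm256_add_pd(a, b)) };
    assert_eq!(sum, [6.0, 8.0, 10.0, 12.0]);
}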
37
38/// Adds packed single-precision (32-bit) floating-point elements in `a` and
39/// `b`.
40///
41/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
42#[inline]
43#[target_feature(enable = "avx")]
44#[cfg_attr(test, assert_instr(vaddps))]
45#[stable(feature = "simd_x86", since = "1.27.0")]
46#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
47pub const fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
48    unsafe { simd_add(a, b) }
49}
50
51/// Computes the bitwise AND of packed double-precision (64-bit)
52/// floating-point elements in `a` and `b`.
53///
54/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
55#[inline]
56#[target_feature(enable = "avx")]
57// See https://github.com/rust-lang/stdarch/issues/71
58#[cfg_attr(test, assert_instr(vandp))]
59#[stable(feature = "simd_x86", since = "1.27.0")]
60#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
61pub const fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
62    unsafe {
63        let a: u64x4 = transmute(a);
64        let b: u64x4 = transmute(b);
65        transmute(simd_and(a, b))
66    }
67}
68
69/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
70/// elements in `a` and `b`.
71///
72/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
73#[inline]
74#[target_feature(enable = "avx")]
75#[cfg_attr(test, assert_instr(vandps))]
76#[stable(feature = "simd_x86", since = "1.27.0")]
77#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
78pub const fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
79    unsafe {
80        let a: u32x8 = transmute(a);
81        let b: u32x8 = transmute(b);
82        transmute(simd_and(a, b))
83    }
84}
85
86/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
87/// elements in `a` and `b`.
88///
89/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
90#[inline]
91#[target_feature(enable = "avx")]
92// See <https://github.com/rust-lang/stdarch/issues/71>.
93#[cfg_attr(test, assert_instr(vorp))]
94#[stable(feature = "simd_x86", since = "1.27.0")]
95#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
96pub const fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
97    unsafe {
98        let a: u64x4 = transmute(a);
99        let b: u64x4 = transmute(b);
100        transmute(simd_or(a, b))
101    }
102}
103
104/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
105/// elements in `a` and `b`.
106///
107/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
108#[inline]
109#[target_feature(enable = "avx")]
110#[cfg_attr(test, assert_instr(vorps))]
111#[stable(feature = "simd_x86", since = "1.27.0")]
112#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
113pub const fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
114    unsafe {
115        let a: u32x8 = transmute(a);
116        let b: u32x8 = transmute(b);
117        transmute(simd_or(a, b))
118    }
119}
120
121/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
122/// lanes using the control in `imm8`.
123///
124/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
125#[inline]
126#[target_feature(enable = "avx")]
127#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
128#[rustc_legacy_const_generics(2)]
129#[stable(feature = "simd_x86", since = "1.27.0")]
130#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
131pub const fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
132    static_assert_uimm_bits!(MASK, 8);
133    unsafe {
134        simd_shuffle!(
135            a,
136            b,
137            [
138                MASK as u32 & 0b1,
139                ((MASK as u32 >> 1) & 0b1) + 4,
140                ((MASK as u32 >> 2) & 0b1) + 2,
141                ((MASK as u32 >> 3) & 0b1) + 6,
142            ],
143        )
144    }
145}
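
// Editorial usage sketch (not part of the upstream source): illustrates how each
// of the low four bits of `MASK` picks the low (0) or high (1) element of a
// 128-bit lane, alternating between `a` and `b`, matching the index arithmetic
// above. Assumes an AVX-capable CPU; the function name is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_shuffle_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
    // MASK = 0b0101: lane 0 receives a[1] and b[0]; lane 1 receives a[3] and b[2].
    let r: [f64; 4] = unsafe { mem::transmute(_mm256_shuffle_pd::<0b0101>(a, b)) };
    assert_eq!(r, [2.0, 10.0, 4.0, 30.0]);
}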
146
147/// Shuffles single-precision (32-bit) floating-point elements in `a` within
148/// 128-bit lanes using the control in `imm8`.
149///
150/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
151#[inline]
152#[target_feature(enable = "avx")]
153#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
154#[rustc_legacy_const_generics(2)]
155#[stable(feature = "simd_x86", since = "1.27.0")]
156#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
157pub const fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
158    static_assert_uimm_bits!(MASK, 8);
159    unsafe {
160        simd_shuffle!(
161            a,
162            b,
163            [
164                MASK as u32 & 0b11,
165                (MASK as u32 >> 2) & 0b11,
166                ((MASK as u32 >> 4) & 0b11) + 8,
167                ((MASK as u32 >> 6) & 0b11) + 8,
168                (MASK as u32 & 0b11) + 4,
169                ((MASK as u32 >> 2) & 0b11) + 4,
170                ((MASK as u32 >> 4) & 0b11) + 12,
171                ((MASK as u32 >> 6) & 0b11) + 12,
172            ],
173        )
174    }
175}
176
177/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
178/// elements in `a`, and then AND with `b`.
179///
180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
181#[inline]
182#[target_feature(enable = "avx")]
183#[cfg_attr(test, assert_instr(vandnp))]
184#[stable(feature = "simd_x86", since = "1.27.0")]
185#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
186pub const fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
187    unsafe {
188        let a: u64x4 = transmute(a);
189        let b: u64x4 = transmute(b);
190        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
191    }
192}
193
194/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
195/// elements in `a`
196/// and then AND with `b`.
197///
198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
199#[inline]
200#[target_feature(enable = "avx")]
201#[cfg_attr(test, assert_instr(vandnps))]
202#[stable(feature = "simd_x86", since = "1.27.0")]
203#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
204pub const fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
205    unsafe {
206        let a: u32x8 = transmute(a);
207        let b: u32x8 = transmute(b);
208        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
209    }
210}
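
// Editorial usage sketch (not part of the upstream source): a common use of the
// NOT-then-AND pattern is clearing the sign bit to compute absolute values, by
// AND-NOT-ing with a vector of `-0.0` (only the sign bit set). Assumes an
// AVX-capable CPU; the function name is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_andnot_abs() {
    let sign_bits = _mm256_set1_ps(-0.0);
    let x = _mm256_setr_ps(-1.5, 2.0, -0.25, 4.0, -8.0, 0.5, -3.0, 7.0);
    // (!sign_bits) & x keeps every bit of `x` except the sign bit.
    let abs: [f32; 8] = unsafe { mem::transmute(_mm256_andnot_ps(sign_bits, x)) };
    assert_eq!(abs, [1.5, 2.0, 0.25, 4.0, 8.0, 0.5, 3.0, 7.0]);
}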
211
212/// Compares packed double-precision (64-bit) floating-point elements
213/// in `a` and `b`, and returns packed maximum values.
214///
215/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
216#[inline]
217#[target_feature(enable = "avx")]
218#[cfg_attr(test, assert_instr(vmaxpd))]
219#[stable(feature = "simd_x86", since = "1.27.0")]
220pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
221    unsafe { vmaxpd(a, b) }
222}
223
224/// Compares packed single-precision (32-bit) floating-point elements in `a`
225/// and `b`, and returns packed maximum values.
226///
227/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
228#[inline]
229#[target_feature(enable = "avx")]
230#[cfg_attr(test, assert_instr(vmaxps))]
231#[stable(feature = "simd_x86", since = "1.27.0")]
232pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
233    unsafe { vmaxps(a, b) }
234}
235
236/// Compares packed double-precision (64-bit) floating-point elements
237/// in `a` and `b`, and returns packed minimum values.
238///
239/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
240#[inline]
241#[target_feature(enable = "avx")]
242#[cfg_attr(test, assert_instr(vminpd))]
243#[stable(feature = "simd_x86", since = "1.27.0")]
244pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
245    unsafe { vminpd(a, b) }
246}
247
248/// Compares packed single-precision (32-bit) floating-point elements in `a`
249/// and `b`, and returns packed minimum values.
250///
251/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
252#[inline]
253#[target_feature(enable = "avx")]
254#[cfg_attr(test, assert_instr(vminps))]
255#[stable(feature = "simd_x86", since = "1.27.0")]
256pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
257    unsafe { vminps(a, b) }
258}
259
260/// Multiplies packed double-precision (64-bit) floating-point elements
261/// in `a` and `b`.
262///
263/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
264#[inline]
265#[target_feature(enable = "avx")]
266#[cfg_attr(test, assert_instr(vmulpd))]
267#[stable(feature = "simd_x86", since = "1.27.0")]
268#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
269pub const fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
270    unsafe { simd_mul(a, b) }
271}
272
273/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
274/// `b`.
275///
276/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
277#[inline]
278#[target_feature(enable = "avx")]
279#[cfg_attr(test, assert_instr(vmulps))]
280#[stable(feature = "simd_x86", since = "1.27.0")]
281#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
282pub const fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
283    unsafe { simd_mul(a, b) }
284}
285
286/// Alternately adds and subtracts packed double-precision (64-bit)
287/// floating-point elements in `a` to/from packed elements in `b`.
288///
289/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
290#[inline]
291#[target_feature(enable = "avx")]
292#[cfg_attr(test, assert_instr(vaddsubpd))]
293#[stable(feature = "simd_x86", since = "1.27.0")]
294#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
295pub const fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
296    unsafe {
297        let a = a.as_f64x4();
298        let b = b.as_f64x4();
299        let add = simd_add(a, b);
300        let sub = simd_sub(a, b);
301        simd_shuffle!(add, sub, [4, 1, 6, 3])
302    }
303}
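
// Editorial usage sketch (not part of the upstream source): even-indexed elements
// of the result are `a - b` and odd-indexed elements are `a + b`, exactly as the
// shuffle above selects. Assumes an AVX-capable CPU; the function name is
// illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_addsub_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
    let r: [f64; 4] = unsafe { mem::transmute(_mm256_addsub_pd(a, b)) };
    assert_eq!(r, [-9.0, 22.0, -27.0, 44.0]);
}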
304
305/// Alternately adds and subtracts packed single-precision (32-bit)
306/// floating-point elements in `a` to/from packed elements in `b`.
307///
308/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
309#[inline]
310#[target_feature(enable = "avx")]
311#[cfg_attr(test, assert_instr(vaddsubps))]
312#[stable(feature = "simd_x86", since = "1.27.0")]
313#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
314pub const fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
315    unsafe {
316        let a = a.as_f32x8();
317        let b = b.as_f32x8();
318        let add = simd_add(a, b);
319        let sub = simd_sub(a, b);
320        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
321    }
322}
323
324/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
325/// from packed elements in `a`.
326///
327/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
328#[inline]
329#[target_feature(enable = "avx")]
330#[cfg_attr(test, assert_instr(vsubpd))]
331#[stable(feature = "simd_x86", since = "1.27.0")]
332#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
333pub const fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
334    unsafe { simd_sub(a, b) }
335}
336
337/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
338/// from packed elements in `a`.
339///
340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
341#[inline]
342#[target_feature(enable = "avx")]
343#[cfg_attr(test, assert_instr(vsubps))]
344#[stable(feature = "simd_x86", since = "1.27.0")]
345#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
346pub const fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
347    unsafe { simd_sub(a, b) }
348}
349
350/// Computes the division of each of the 8 packed single-precision (32-bit)
351/// floating-point elements in `a` by the corresponding packed elements in `b`.
352///
353/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
354#[inline]
355#[target_feature(enable = "avx")]
356#[cfg_attr(test, assert_instr(vdivps))]
357#[stable(feature = "simd_x86", since = "1.27.0")]
358#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
359pub const fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
360    unsafe { simd_div(a, b) }
361}
362
363/// Computes the division of each of the 4 packed double-precision (64-bit)
364/// floating-point elements in `a` by the corresponding packed elements in `b`.
365///
366/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
367#[inline]
368#[target_feature(enable = "avx")]
369#[cfg_attr(test, assert_instr(vdivpd))]
370#[stable(feature = "simd_x86", since = "1.27.0")]
371#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
372pub const fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
373    unsafe { simd_div(a, b) }
374}
375
376/// Rounds packed double-precision (64-bit) floating point elements in `a`
377/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
378///
379/// - `0x00`: Round to the nearest whole number.
380/// - `0x01`: Round down, toward negative infinity.
381/// - `0x02`: Round up, toward positive infinity.
382/// - `0x03`: Truncate the values.
383///
384/// For a complete list of options, check [the LLVM docs][llvm_docs].
385///
386/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
387///
388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
389#[inline]
390#[target_feature(enable = "avx")]
391#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
392#[rustc_legacy_const_generics(1)]
393#[stable(feature = "simd_x86", since = "1.27.0")]
394pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
395    static_assert_uimm_bits!(ROUNDING, 4);
396    unsafe { roundpd256(a, ROUNDING) }
397}
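
// Editorial usage sketch (not part of the upstream source): contrasts truncation
// (`0x03`) with rounding toward negative infinity (`0x01`) on the same input.
// Assumes an AVX-capable CPU; the function name is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_round_pd() {
    let a = _mm256_setr_pd(1.9, -1.9, 2.5, -2.5);
    let trunc: [f64; 4] = unsafe { mem::transmute(_mm256_round_pd::<0x03>(a)) };
    let floor: [f64; 4] = unsafe { mem::transmute(_mm256_round_pd::<0x01>(a)) };
    assert_eq!(trunc, [1.0, -1.0, 2.0, -2.0]);
    assert_eq!(floor, [1.0, -2.0, 2.0, -3.0]);
}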
398
399/// Rounds packed double-precision (64-bit) floating point elements in `a`
400/// toward positive infinity.
401///
402/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
403#[inline]
404#[target_feature(enable = "avx")]
405#[cfg_attr(test, assert_instr(vroundpd))]
406#[stable(feature = "simd_x86", since = "1.27.0")]
407#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
408pub const fn _mm256_ceil_pd(a: __m256d) -> __m256d {
409    unsafe { simd_ceil(a) }
410}
411
412/// Rounds packed double-precision (64-bit) floating point elements in `a`
413/// toward negative infinity.
414///
415/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
416#[inline]
417#[target_feature(enable = "avx")]
418#[cfg_attr(test, assert_instr(vroundpd))]
419#[stable(feature = "simd_x86", since = "1.27.0")]
420#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
421pub const fn _mm256_floor_pd(a: __m256d) -> __m256d {
422    unsafe { simd_floor(a) }
423}
424
425/// Rounds packed single-precision (32-bit) floating point elements in `a`
426/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
427///
428/// - `0x00`: Round to the nearest whole number.
429/// - `0x01`: Round down, toward negative infinity.
430/// - `0x02`: Round up, toward positive infinity.
431/// - `0x03`: Truncate the values.
432///
433/// For a complete list of options, check [the LLVM docs][llvm_docs].
434///
435/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
436///
437/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
438#[inline]
439#[target_feature(enable = "avx")]
440#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
441#[rustc_legacy_const_generics(1)]
442#[stable(feature = "simd_x86", since = "1.27.0")]
443pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
444    static_assert_uimm_bits!(ROUNDING, 4);
445    unsafe { roundps256(a, ROUNDING) }
446}
447
448/// Rounds packed single-precision (32-bit) floating point elements in `a`
449/// toward positive infinity.
450///
451/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
452#[inline]
453#[target_feature(enable = "avx")]
454#[cfg_attr(test, assert_instr(vroundps))]
455#[stable(feature = "simd_x86", since = "1.27.0")]
456#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
457pub const fn _mm256_ceil_ps(a: __m256) -> __m256 {
458    unsafe { simd_ceil(a) }
459}
460
461/// Rounds packed single-precision (32-bit) floating point elements in `a`
462/// toward negative infinity.
463///
464/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
465#[inline]
466#[target_feature(enable = "avx")]
467#[cfg_attr(test, assert_instr(vroundps))]
468#[stable(feature = "simd_x86", since = "1.27.0")]
469#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
470pub const fn _mm256_floor_ps(a: __m256) -> __m256 {
471    unsafe { simd_floor(a) }
472}
473
474/// Returns the square root of packed single-precision (32-bit) floating point
475/// elements in `a`.
476///
477/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
478#[inline]
479#[target_feature(enable = "avx")]
480#[cfg_attr(test, assert_instr(vsqrtps))]
481#[stable(feature = "simd_x86", since = "1.27.0")]
482pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
483    unsafe { simd_fsqrt(a) }
484}
485
486/// Returns the square root of packed double-precision (64-bit) floating point
487/// elements in `a`.
488///
489/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
490#[inline]
491#[target_feature(enable = "avx")]
492#[cfg_attr(test, assert_instr(vsqrtpd))]
493#[stable(feature = "simd_x86", since = "1.27.0")]
494pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
495    unsafe { simd_fsqrt(a) }
496}
497
498/// Blends packed double-precision (64-bit) floating-point elements from
499/// `a` and `b` using control mask `imm8`.
500///
501/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
502#[inline]
503#[target_feature(enable = "avx")]
504// Note: LLVM7 prefers single-precision blend instructions when
505// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
506// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
507#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
508#[rustc_legacy_const_generics(2)]
509#[stable(feature = "simd_x86", since = "1.27.0")]
510#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
511pub const fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
512    static_assert_uimm_bits!(IMM4, 4);
513    unsafe {
514        simd_shuffle!(
515            a,
516            b,
517            [
518                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
519                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
520                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
521                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
522            ],
523        )
524    }
525}
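
// Editorial usage sketch (not part of the upstream source): bit `i` of the control
// selects element `i` from `b` when set and from `a` when clear. Assumes an
// AVX-capable CPU; the function name is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_blend_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
    // 0b0101: take elements 0 and 2 from `b`, elements 1 and 3 from `a`.
    let r: [f64; 4] = unsafe { mem::transmute(_mm256_blend_pd::<0b0101>(a, b)) };
    assert_eq!(r, [10.0, 2.0, 30.0, 4.0]);
}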
526
527/// Blends packed single-precision (32-bit) floating-point elements from
528/// `a` and `b` using control mask `imm8`.
529///
530/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
531#[inline]
532#[target_feature(enable = "avx")]
533#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
534#[rustc_legacy_const_generics(2)]
535#[stable(feature = "simd_x86", since = "1.27.0")]
536#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
537pub const fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
538    static_assert_uimm_bits!(IMM8, 8);
539    unsafe {
540        simd_shuffle!(
541            a,
542            b,
543            [
544                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
545                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
546                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
547                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
548                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
549                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
550                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
551                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
552            ],
553        )
554    }
555}
556
557/// Blends packed double-precision (64-bit) floating-point elements from
558/// `a` and `b` using `c` as a mask.
559///
560/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
561#[inline]
562#[target_feature(enable = "avx")]
563#[cfg_attr(test, assert_instr(vblendvpd))]
564#[stable(feature = "simd_x86", since = "1.27.0")]
565#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
566pub const fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
567    unsafe {
568        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
569        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
570    }
571}
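
// Editorial usage sketch (not part of the upstream source): only the sign bit of
// each element of `c` is consulted, so `-0.0` selects from `b` while `+0.0`
// selects from `a`. Assumes an AVX-capable CPU; the function name is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_blendv_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
    let c = _mm256_setr_pd(-1.0, 1.0, -0.0, 0.0);
    let r: [f64; 4] = unsafe { mem::transmute(_mm256_blendv_pd(a, b, c)) };
    assert_eq!(r, [10.0, 2.0, 30.0, 4.0]);
}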
572
573/// Blends packed single-precision (32-bit) floating-point elements from
574/// `a` and `b` using `c` as a mask.
575///
576/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
577#[inline]
578#[target_feature(enable = "avx")]
579#[cfg_attr(test, assert_instr(vblendvps))]
580#[stable(feature = "simd_x86", since = "1.27.0")]
581#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
582pub const fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
583    unsafe {
584        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
585        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
586    }
587}
588
589/// Conditionally multiplies the packed single-precision (32-bit)
590/// floating-point elements in `a` and `b` using the high 4 bits in `imm8`,
591/// sums the four products, and conditionally returns the sum
592/// using the low 4 bits of `imm8`.
593///
594/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
595#[inline]
596#[target_feature(enable = "avx")]
597#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
598#[rustc_legacy_const_generics(2)]
599#[stable(feature = "simd_x86", since = "1.27.0")]
600pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
601    static_assert_uimm_bits!(IMM8, 8);
602    unsafe { vdpps(a, b, IMM8 as i8) }
603}
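
// Editorial usage sketch (not part of the upstream source): the dot product is
// computed independently in each 128-bit lane; with `IMM8 = 0xF1` all four
// elements of a lane participate in the products, and only element 0 of that
// lane receives the sum (the remaining elements are zeroed). Assumes an
// AVX-capable CPU; the function name is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_dp_ps() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0);
    let b = _mm256_setr_ps(1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0);
    let r: [f32; 8] = unsafe { mem::transmute(_mm256_dp_ps::<0xF1>(a, b)) };
    assert_eq!(r, [10.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0]);
}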
604
605/// Horizontal addition of adjacent pairs in the two packed vectors
606/// of four 64-bit floating-point elements in `a` and `b`.
607/// In the result, sums of elements from `a` are returned in even locations,
608/// while sums of elements from `b` are returned in odd locations.
609///
610/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
611#[inline]
612#[target_feature(enable = "avx")]
613#[cfg_attr(test, assert_instr(vhaddpd))]
614#[stable(feature = "simd_x86", since = "1.27.0")]
615#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
616pub const fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
617    unsafe {
618        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
619        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
620        simd_add(even, odd)
621    }
622}
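
// Editorial usage sketch (not part of the upstream source): within each 128-bit
// lane, the result interleaves the pair-sum of `a` (even position) with the
// pair-sum of `b` (odd position). Assumes an AVX-capable CPU; the function name
// is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_hadd_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
    let r: [f64; 4] = unsafe { mem::transmute(_mm256_hadd_pd(a, b)) };
    assert_eq!(r, [1.0 + 2.0, 10.0 + 20.0, 3.0 + 4.0, 30.0 + 40.0]);
}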
623
624/// Horizontal addition of adjacent pairs in the two packed vectors
625/// of eight 32-bit floating-point elements in `a` and `b`.
626/// In the result, sums of elements from `a` are returned in locations of
627/// indices 0, 1, 4, 5, while sums of elements from `b` are returned in
628/// locations 2, 3, 6, 7.
629///
630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
631#[inline]
632#[target_feature(enable = "avx")]
633#[cfg_attr(test, assert_instr(vhaddps))]
634#[stable(feature = "simd_x86", since = "1.27.0")]
635#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
636pub const fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
637    unsafe {
638        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
639        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
640        simd_add(even, odd)
641    }
642}
643
644/// Horizontal subtraction of adjacent pairs in the two packed vectors
645/// of four 64-bit floating-point elements in `a` and `b`.
646/// In the result, differences of elements from `a` are returned in even
647/// locations, while differences of elements from `b` are returned in odd locations.
648///
649/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
650#[inline]
651#[target_feature(enable = "avx")]
652#[cfg_attr(test, assert_instr(vhsubpd))]
653#[stable(feature = "simd_x86", since = "1.27.0")]
654#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
655pub const fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
656    unsafe {
657        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
658        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
659        simd_sub(even, odd)
660    }
661}
662
663/// Horizontal subtraction of adjacent pairs in the two packed vectors
664/// of eight 32-bit floating-point elements in `a` and `b`.
665/// In the result, differences of elements from `a` are returned in locations of
666/// indices 0, 1, 4, 5, while differences of elements from `b` are returned in
667/// locations 2, 3, 6, 7.
668///
669/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
670#[inline]
671#[target_feature(enable = "avx")]
672#[cfg_attr(test, assert_instr(vhsubps))]
673#[stable(feature = "simd_x86", since = "1.27.0")]
674#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
675pub const fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
676    unsafe {
677        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
678        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
679        simd_sub(even, odd)
680    }
681}
682
683/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
684/// elements in `a` and `b`.
685///
686/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
687#[inline]
688#[target_feature(enable = "avx")]
689#[cfg_attr(test, assert_instr(vxorp))]
690#[stable(feature = "simd_x86", since = "1.27.0")]
691#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
692pub const fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
693    unsafe {
694        let a: u64x4 = transmute(a);
695        let b: u64x4 = transmute(b);
696        transmute(simd_xor(a, b))
697    }
698}
699
700/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
701/// elements in `a` and `b`.
702///
703/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
704#[inline]
705#[target_feature(enable = "avx")]
706#[cfg_attr(test, assert_instr(vxorps))]
707#[stable(feature = "simd_x86", since = "1.27.0")]
708#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
709pub const fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
710    unsafe {
711        let a: u32x8 = transmute(a);
712        let b: u32x8 = transmute(b);
713        transmute(simd_xor(a, b))
714    }
715}
716
717/// Equal (ordered, non-signaling)
718#[stable(feature = "simd_x86", since = "1.27.0")]
719pub const _CMP_EQ_OQ: i32 = 0x00;
720/// Less-than (ordered, signaling)
721#[stable(feature = "simd_x86", since = "1.27.0")]
722pub const _CMP_LT_OS: i32 = 0x01;
723/// Less-than-or-equal (ordered, signaling)
724#[stable(feature = "simd_x86", since = "1.27.0")]
725pub const _CMP_LE_OS: i32 = 0x02;
726/// Unordered (non-signaling)
727#[stable(feature = "simd_x86", since = "1.27.0")]
728pub const _CMP_UNORD_Q: i32 = 0x03;
729/// Not-equal (unordered, non-signaling)
730#[stable(feature = "simd_x86", since = "1.27.0")]
731pub const _CMP_NEQ_UQ: i32 = 0x04;
732/// Not-less-than (unordered, signaling)
733#[stable(feature = "simd_x86", since = "1.27.0")]
734pub const _CMP_NLT_US: i32 = 0x05;
735/// Not-less-than-or-equal (unordered, signaling)
736#[stable(feature = "simd_x86", since = "1.27.0")]
737pub const _CMP_NLE_US: i32 = 0x06;
738/// Ordered (non-signaling)
739#[stable(feature = "simd_x86", since = "1.27.0")]
740pub const _CMP_ORD_Q: i32 = 0x07;
741/// Equal (unordered, non-signaling)
742#[stable(feature = "simd_x86", since = "1.27.0")]
743pub const _CMP_EQ_UQ: i32 = 0x08;
744/// Not-greater-than-or-equal (unordered, signaling)
745#[stable(feature = "simd_x86", since = "1.27.0")]
746pub const _CMP_NGE_US: i32 = 0x09;
747/// Not-greater-than (unordered, signaling)
748#[stable(feature = "simd_x86", since = "1.27.0")]
749pub const _CMP_NGT_US: i32 = 0x0a;
750/// False (ordered, non-signaling)
751#[stable(feature = "simd_x86", since = "1.27.0")]
752pub const _CMP_FALSE_OQ: i32 = 0x0b;
753/// Not-equal (ordered, non-signaling)
754#[stable(feature = "simd_x86", since = "1.27.0")]
755pub const _CMP_NEQ_OQ: i32 = 0x0c;
756/// Greater-than-or-equal (ordered, signaling)
757#[stable(feature = "simd_x86", since = "1.27.0")]
758pub const _CMP_GE_OS: i32 = 0x0d;
759/// Greater-than (ordered, signaling)
760#[stable(feature = "simd_x86", since = "1.27.0")]
761pub const _CMP_GT_OS: i32 = 0x0e;
762/// True (unordered, non-signaling)
763#[stable(feature = "simd_x86", since = "1.27.0")]
764pub const _CMP_TRUE_UQ: i32 = 0x0f;
765/// Equal (ordered, signaling)
766#[stable(feature = "simd_x86", since = "1.27.0")]
767pub const _CMP_EQ_OS: i32 = 0x10;
768/// Less-than (ordered, non-signaling)
769#[stable(feature = "simd_x86", since = "1.27.0")]
770pub const _CMP_LT_OQ: i32 = 0x11;
771/// Less-than-or-equal (ordered, non-signaling)
772#[stable(feature = "simd_x86", since = "1.27.0")]
773pub const _CMP_LE_OQ: i32 = 0x12;
774/// Unordered (signaling)
775#[stable(feature = "simd_x86", since = "1.27.0")]
776pub const _CMP_UNORD_S: i32 = 0x13;
777/// Not-equal (unordered, signaling)
778#[stable(feature = "simd_x86", since = "1.27.0")]
779pub const _CMP_NEQ_US: i32 = 0x14;
780/// Not-less-than (unordered, non-signaling)
781#[stable(feature = "simd_x86", since = "1.27.0")]
782pub const _CMP_NLT_UQ: i32 = 0x15;
783/// Not-less-than-or-equal (unordered, non-signaling)
784#[stable(feature = "simd_x86", since = "1.27.0")]
785pub const _CMP_NLE_UQ: i32 = 0x16;
786/// Ordered (signaling)
787#[stable(feature = "simd_x86", since = "1.27.0")]
788pub const _CMP_ORD_S: i32 = 0x17;
789/// Equal (unordered, signaling)
790#[stable(feature = "simd_x86", since = "1.27.0")]
791pub const _CMP_EQ_US: i32 = 0x18;
792/// Not-greater-than-or-equal (unordered, non-signaling)
793#[stable(feature = "simd_x86", since = "1.27.0")]
794pub const _CMP_NGE_UQ: i32 = 0x19;
795/// Not-greater-than (unordered, non-signaling)
796#[stable(feature = "simd_x86", since = "1.27.0")]
797pub const _CMP_NGT_UQ: i32 = 0x1a;
798/// False (ordered, signaling)
799#[stable(feature = "simd_x86", since = "1.27.0")]
800pub const _CMP_FALSE_OS: i32 = 0x1b;
801/// Not-equal (ordered, signaling)
802#[stable(feature = "simd_x86", since = "1.27.0")]
803pub const _CMP_NEQ_OS: i32 = 0x1c;
804/// Greater-than-or-equal (ordered, non-signaling)
805#[stable(feature = "simd_x86", since = "1.27.0")]
806pub const _CMP_GE_OQ: i32 = 0x1d;
807/// Greater-than (ordered, non-signaling)
808#[stable(feature = "simd_x86", since = "1.27.0")]
809pub const _CMP_GT_OQ: i32 = 0x1e;
810/// True (unordered, signaling)
811#[stable(feature = "simd_x86", since = "1.27.0")]
812pub const _CMP_TRUE_US: i32 = 0x1f;
813
814/// Compares packed double-precision (64-bit) floating-point
815/// elements in `a` and `b` based on the comparison operand
816/// specified by `IMM5`.
817///
818/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
819#[inline]
820#[target_feature(enable = "avx")]
821#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
822#[rustc_legacy_const_generics(2)]
823#[stable(feature = "simd_x86", since = "1.27.0")]
824pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
825    static_assert_uimm_bits!(IMM5, 5);
826    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
827}
828
829/// Compares packed double-precision (64-bit) floating-point
830/// elements in `a` and `b` based on the comparison operand
831/// specified by `IMM5`.
832///
833/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
834#[inline]
835#[target_feature(enable = "avx")]
836#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
837#[rustc_legacy_const_generics(2)]
838#[stable(feature = "simd_x86", since = "1.27.0")]
839pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
840    static_assert_uimm_bits!(IMM5, 5);
841    unsafe { vcmppd256(a, b, IMM5 as u8) }
842}
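
// Editorial usage sketch (not part of the upstream source): pairs the comparison
// with `_mm256_movemask_pd` to turn the per-element all-ones/all-zeros result
// into a 4-bit mask. `_CMP_LT_OQ` is an ordered predicate, so a NaN operand
// compares false. Assumes an AVX-capable CPU; the function name is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_cmp_pd() {
    let a = _mm256_setr_pd(1.0, 4.0, 3.0, f64::NAN);
    let b = _mm256_setr_pd(2.0, 2.0, 3.0, 3.0);
    let lt = _mm256_cmp_pd::<_CMP_LT_OQ>(a, b);
    // Only element 0 satisfies `a < b`.
    assert_eq!(_mm256_movemask_pd(lt), 0b0001);
}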
843
844/// Compares packed single-precision (32-bit) floating-point
845/// elements in `a` and `b` based on the comparison operand
846/// specified by `IMM5`.
847///
848/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
849#[inline]
850#[target_feature(enable = "avx")]
851#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
852#[rustc_legacy_const_generics(2)]
853#[stable(feature = "simd_x86", since = "1.27.0")]
854pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
855    static_assert_uimm_bits!(IMM5, 5);
856    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
857}
858
859/// Compares packed single-precision (32-bit) floating-point
860/// elements in `a` and `b` based on the comparison operand
861/// specified by `IMM5`.
862///
863/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
864#[inline]
865#[target_feature(enable = "avx")]
866#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
867#[rustc_legacy_const_generics(2)]
868#[stable(feature = "simd_x86", since = "1.27.0")]
869pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
870    static_assert_uimm_bits!(IMM5, 5);
871    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
872}
873
874/// Compares the lower double-precision (64-bit) floating-point element in
875/// `a` and `b` based on the comparison operand specified by `IMM5`,
876/// stores the result in the lower element of the returned vector,
877/// and copies the upper element from `a` to the upper element of the returned
878/// vector.
879///
880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
881#[inline]
882#[target_feature(enable = "avx")]
883#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
884#[rustc_legacy_const_generics(2)]
885#[stable(feature = "simd_x86", since = "1.27.0")]
886pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
887    static_assert_uimm_bits!(IMM5, 5);
888    unsafe { vcmpsd(a, b, IMM5 as i8) }
889}
890
891/// Compares the lower single-precision (32-bit) floating-point element in
892/// `a` and `b` based on the comparison operand specified by `IMM5`,
893/// stores the result in the lower element of the returned vector,
894/// and copies the upper 3 packed elements from `a` to the upper elements of
895/// the returned vector.
896///
897/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
898#[inline]
899#[target_feature(enable = "avx")]
900#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
901#[rustc_legacy_const_generics(2)]
902#[stable(feature = "simd_x86", since = "1.27.0")]
903pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
904    static_assert_uimm_bits!(IMM5, 5);
905    unsafe { vcmpss(a, b, IMM5 as i8) }
906}
907
908/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
909/// floating-point elements.
910///
911/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
912#[inline]
913#[target_feature(enable = "avx")]
914#[cfg_attr(test, assert_instr(vcvtdq2pd))]
915#[stable(feature = "simd_x86", since = "1.27.0")]
916#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
917pub const fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
918    unsafe { simd_cast(a.as_i32x4()) }
919}
920
921/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
922/// floating-point elements.
923///
924/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
925#[inline]
926#[target_feature(enable = "avx")]
927#[cfg_attr(test, assert_instr(vcvtdq2ps))]
928#[stable(feature = "simd_x86", since = "1.27.0")]
929#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
930pub const fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
931    unsafe { simd_cast(a.as_i32x8()) }
932}
933
934/// Converts packed double-precision (64-bit) floating-point elements in `a`
935/// to packed single-precision (32-bit) floating-point elements.
936///
937/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
938#[inline]
939#[target_feature(enable = "avx")]
940#[cfg_attr(test, assert_instr(vcvtpd2ps))]
941#[stable(feature = "simd_x86", since = "1.27.0")]
942#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
943pub const fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
944    unsafe { simd_cast(a) }
945}
946
947/// Converts packed single-precision (32-bit) floating-point elements in `a`
948/// to packed 32-bit integers.
949///
950/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
951#[inline]
952#[target_feature(enable = "avx")]
953#[cfg_attr(test, assert_instr(vcvtps2dq))]
954#[stable(feature = "simd_x86", since = "1.27.0")]
955pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
956    unsafe { transmute(vcvtps2dq(a)) }
957}
958
959/// Converts packed single-precision (32-bit) floating-point elements in `a`
960/// to packed double-precision (64-bit) floating-point elements.
961///
962/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
963#[inline]
964#[target_feature(enable = "avx")]
965#[cfg_attr(test, assert_instr(vcvtps2pd))]
966#[stable(feature = "simd_x86", since = "1.27.0")]
967#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
968pub const fn _mm256_cvtps_pd(a: __m128) -> __m256d {
969    unsafe { simd_cast(a) }
970}
971
972/// Returns the first element of the input vector of `[4 x double]`.
973///
974/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
975#[inline]
976#[target_feature(enable = "avx")]
977//#[cfg_attr(test, assert_instr(movsd))] FIXME
978#[stable(feature = "simd_x86", since = "1.27.0")]
979#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
980pub const fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
981    unsafe { simd_extract!(a, 0) }
982}
983
984/// Converts packed double-precision (64-bit) floating-point elements in `a`
985/// to packed 32-bit integers with truncation.
986///
987/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
988#[inline]
989#[target_feature(enable = "avx")]
990#[cfg_attr(test, assert_instr(vcvttpd2dq))]
991#[stable(feature = "simd_x86", since = "1.27.0")]
992pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
993    unsafe { transmute(vcvttpd2dq(a)) }
994}
995
996/// Converts packed double-precision (64-bit) floating-point elements in `a`
997/// to packed 32-bit integers.
998///
999/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
1000#[inline]
1001#[target_feature(enable = "avx")]
1002#[cfg_attr(test, assert_instr(vcvtpd2dq))]
1003#[stable(feature = "simd_x86", since = "1.27.0")]
1004pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
1005    unsafe { transmute(vcvtpd2dq(a)) }
1006}
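
// Editorial usage sketch (not part of the upstream source): contrasts
// `_mm256_cvtpd_epi32` (rounds according to MXCSR, round-to-nearest-even by
// default) with `_mm256_cvttpd_epi32` (always truncates). Assumes the default
// MXCSR rounding mode and an AVX-capable CPU; the function name is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_cvtpd_epi32() {
    let a = _mm256_setr_pd(1.7, -1.7, 2.5, -2.5);
    let rounded: [i32; 4] = unsafe { mem::transmute(_mm256_cvtpd_epi32(a)) };
    let truncated: [i32; 4] = unsafe { mem::transmute(_mm256_cvttpd_epi32(a)) };
    assert_eq!(rounded, [2, -2, 2, -2]);
    assert_eq!(truncated, [1, -1, 2, -2]);
}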
1007
1008/// Converts packed single-precision (32-bit) floating-point elements in `a`
1009/// to packed 32-bit integers with truncation.
1010///
1011/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
1012#[inline]
1013#[target_feature(enable = "avx")]
1014#[cfg_attr(test, assert_instr(vcvttps2dq))]
1015#[stable(feature = "simd_x86", since = "1.27.0")]
1016pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
1017    unsafe { transmute(vcvttps2dq(a)) }
1018}
1019
1020/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
1021/// floating-point elements) from `a`, selected with `imm8`.
1022///
1023/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
1024#[inline]
1025#[target_feature(enable = "avx")]
1026#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
1027#[rustc_legacy_const_generics(1)]
1028#[stable(feature = "simd_x86", since = "1.27.0")]
1029#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1030pub const fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
1031    static_assert_uimm_bits!(IMM1, 1);
1032    unsafe {
1033        simd_shuffle!(
1034            a,
1035            _mm256_undefined_ps(),
1036            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
1037        )
1038    }
1039}
1040
1041/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
1042/// floating-point elements) from `a`, selected with `imm8`.
1043///
1044/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
1045#[inline]
1046#[target_feature(enable = "avx")]
1047#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
1048#[rustc_legacy_const_generics(1)]
1049#[stable(feature = "simd_x86", since = "1.27.0")]
1050#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1051pub const fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
1052    static_assert_uimm_bits!(IMM1, 1);
1053    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
1054}
1055
1056/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
1057///
1058/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
1059#[inline]
1060#[target_feature(enable = "avx")]
1061#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
1062#[rustc_legacy_const_generics(1)]
1063#[stable(feature = "simd_x86", since = "1.27.0")]
1064#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1065pub const fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
1066    static_assert_uimm_bits!(IMM1, 1);
1067    unsafe {
1068        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
1069        transmute(dst)
1070    }
1071}
1072
1073/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
1074///
1075/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
1076#[inline]
1077#[target_feature(enable = "avx")]
1078// This intrinsic has no corresponding instruction.
1079#[rustc_legacy_const_generics(1)]
1080#[stable(feature = "simd_x86", since = "1.27.0")]
1081#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1082pub const fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
1083    static_assert_uimm_bits!(INDEX, 3);
1084    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
1085}
1086
1087/// Returns the first element of the input vector of `[8 x i32]`.
1088///
1089/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
1090#[inline]
1091#[target_feature(enable = "avx")]
1092#[stable(feature = "simd_x86", since = "1.27.0")]
1093#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1094pub const fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
1095    unsafe { simd_extract!(a.as_i32x8(), 0) }
1096}
1097
1098/// Zeroes the contents of all XMM or YMM registers.
1099///
1100/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
1101#[inline]
1102#[target_feature(enable = "avx")]
1103#[cfg_attr(test, assert_instr(vzeroall))]
1104#[stable(feature = "simd_x86", since = "1.27.0")]
1105pub fn _mm256_zeroall() {
1106    unsafe { vzeroall() }
1107}
1108
1109/// Zeroes the upper 128 bits of all YMM registers;
1110/// the lower 128 bits of the registers are unmodified.
1111///
1112/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
1113#[inline]
1114#[target_feature(enable = "avx")]
1115#[cfg_attr(test, assert_instr(vzeroupper))]
1116#[stable(feature = "simd_x86", since = "1.27.0")]
1117pub fn _mm256_zeroupper() {
1118    unsafe { vzeroupper() }
1119}
1120
1121/// Shuffles single-precision (32-bit) floating-point elements in `a`
1122/// within 128-bit lanes using the control in `b`.
1123///
1124/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
1125#[inline]
1126#[target_feature(enable = "avx")]
1127#[cfg_attr(test, assert_instr(vpermilps))]
1128#[stable(feature = "simd_x86", since = "1.27.0")]
1129pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
1130    unsafe { vpermilps256(a, b.as_i32x8()) }
1131}
1132
1133/// Shuffles single-precision (32-bit) floating-point elements in `a`
1134/// using the control in `b`.
1135///
1136/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
1137#[inline]
1138#[target_feature(enable = "avx")]
1139#[cfg_attr(test, assert_instr(vpermilps))]
1140#[stable(feature = "simd_x86", since = "1.27.0")]
1141pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
1142    unsafe { vpermilps(a, b.as_i32x4()) }
1143}
1144
1145/// Shuffles single-precision (32-bit) floating-point elements in `a`
1146/// within 128-bit lanes using the control in `imm8`.
1147///
1148/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
1149#[inline]
1150#[target_feature(enable = "avx")]
1151#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
1152#[rustc_legacy_const_generics(1)]
1153#[stable(feature = "simd_x86", since = "1.27.0")]
1154#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1155pub const fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
1156    static_assert_uimm_bits!(IMM8, 8);
1157    unsafe {
1158        simd_shuffle!(
1159            a,
1160            _mm256_undefined_ps(),
1161            [
1162                (IMM8 as u32 >> 0) & 0b11,
1163                (IMM8 as u32 >> 2) & 0b11,
1164                (IMM8 as u32 >> 4) & 0b11,
1165                (IMM8 as u32 >> 6) & 0b11,
1166                ((IMM8 as u32 >> 0) & 0b11) + 4,
1167                ((IMM8 as u32 >> 2) & 0b11) + 4,
1168                ((IMM8 as u32 >> 4) & 0b11) + 4,
1169                ((IMM8 as u32 >> 6) & 0b11) + 4,
1170            ],
1171        )
1172    }
1173}
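
// Editorial usage sketch (not part of the upstream source): the same four 2-bit
// selectors in `IMM8` are applied to both 128-bit lanes, so `0x1B` reverses the
// elements of each lane independently. Assumes an AVX-capable CPU; the function
// name is illustrative.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_permute_ps() {
    let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    let r: [f32; 8] = unsafe { mem::transmute(_mm256_permute_ps::<0x1B>(a)) };
    assert_eq!(r, [3.0, 2.0, 1.0, 0.0, 7.0, 6.0, 5.0, 4.0]);
}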
1174
1175/// Shuffles single-precision (32-bit) floating-point elements in `a`
1176/// using the control in `imm8`.
1177///
1178/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
1179#[inline]
1180#[target_feature(enable = "avx")]
1181#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
1182#[rustc_legacy_const_generics(1)]
1183#[stable(feature = "simd_x86", since = "1.27.0")]
1184#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1185pub const fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
1186    static_assert_uimm_bits!(IMM8, 8);
1187    unsafe {
1188        simd_shuffle!(
1189            a,
1190            _mm_undefined_ps(),
1191            [
1192                (IMM8 as u32 >> 0) & 0b11,
1193                (IMM8 as u32 >> 2) & 0b11,
1194                (IMM8 as u32 >> 4) & 0b11,
1195                (IMM8 as u32 >> 6) & 0b11,
1196            ],
1197        )
1198    }
1199}
1200
1201/// Shuffles double-precision (64-bit) floating-point elements in `a`
1202/// within 128-bit lanes using the control in `b`.
1203///
1204/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
1205#[inline]
1206#[target_feature(enable = "avx")]
1207#[cfg_attr(test, assert_instr(vpermilpd))]
1208#[stable(feature = "simd_x86", since = "1.27.0")]
1209pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
1210    unsafe { vpermilpd256(a, b.as_i64x4()) }
1211}
1212
1213/// Shuffles double-precision (64-bit) floating-point elements in `a`
1214/// using the control in `b`.
1215///
1216/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
1217#[inline]
1218#[target_feature(enable = "avx")]
1219#[cfg_attr(test, assert_instr(vpermilpd))]
1220#[stable(feature = "simd_x86", since = "1.27.0")]
1221pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
1222    unsafe { vpermilpd(a, b.as_i64x2()) }
1223}
1224
1225/// Shuffles double-precision (64-bit) floating-point elements in `a`
1226/// within 128-bit lanes using the control in `imm8`.
1227///
1228/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
1229#[inline]
1230#[target_feature(enable = "avx")]
1231#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
1232#[rustc_legacy_const_generics(1)]
1233#[stable(feature = "simd_x86", since = "1.27.0")]
1234#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1235pub const fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
1236    static_assert_uimm_bits!(IMM4, 4);
1237    unsafe {
1238        simd_shuffle!(
1239            a,
1240            _mm256_undefined_pd(),
1241            [
1242                ((IMM4 as u32 >> 0) & 1),
1243                ((IMM4 as u32 >> 1) & 1),
1244                ((IMM4 as u32 >> 2) & 1) + 2,
1245                ((IMM4 as u32 >> 3) & 1) + 2,
1246            ],
1247        )
1248    }
1249}
1250
1251/// Shuffles double-precision (64-bit) floating-point elements in `a`
1252/// using the control in `imm8`.
1253///
1254/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
1255#[inline]
1256#[target_feature(enable = "avx")]
1257#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
1258#[rustc_legacy_const_generics(1)]
1259#[stable(feature = "simd_x86", since = "1.27.0")]
1260#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1261pub const fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
1262    static_assert_uimm_bits!(IMM2, 2);
1263    unsafe {
1264        simd_shuffle!(
1265            a,
1266            _mm_undefined_pd(),
1267            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
1268        )
1269    }
1270}
1271
1272/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
1273/// floating-point elements) selected by `imm8` from `a` and `b`.
1274///
1275/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
1276#[inline]
1277#[target_feature(enable = "avx")]
1278#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
1279#[rustc_legacy_const_generics(2)]
1280#[stable(feature = "simd_x86", since = "1.27.0")]
1281#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1282pub const fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
1283    static_assert_uimm_bits!(IMM8, 8);
1284    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
1285        _mm256_castps_si256(a),
1286        _mm256_castps_si256(b),
1287    ))
1288}
1289
1290/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
1291/// floating-point elements) selected by `imm8` from `a` and `b`.
1292///
1293/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
1294#[inline]
1295#[target_feature(enable = "avx")]
1296#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
1297#[rustc_legacy_const_generics(2)]
1298#[stable(feature = "simd_x86", since = "1.27.0")]
1299#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1300pub const fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
1301    static_assert_uimm_bits!(IMM8, 8);
1302    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
1303        _mm256_castpd_si256(a),
1304        _mm256_castpd_si256(b),
1305    ))
1306}
1307
1308/// Shuffles 128 bits (composed of integer data) selected by `imm8`
1309/// from `a` and `b`.
1310///
1311/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
1312#[inline]
1313#[target_feature(enable = "avx")]
1314#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
1315#[rustc_legacy_const_generics(2)]
1316#[stable(feature = "simd_x86", since = "1.27.0")]
1317#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1318pub const fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
1319    static_assert_uimm_bits!(IMM8, 8);
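    // `idx` maps each 64-bit result position to an element of the concatenated
    // [a, b] input: selector bits [1:0] (low half) and [5:4] (high half) choose
    // one of the four 128-bit lanes. `idx0` then applies the zeroing bits (3 and
    // 7) by redirecting the position to the all-zero vector in the second shuffle.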
1320    const fn idx(imm8: i32, pos: u32) -> u32 {
1321        let part = if pos < 2 {
1322            imm8 & 0xf
1323        } else {
1324            (imm8 & 0xf0) >> 4
1325        };
1326        2 * (part as u32 & 0b11) + (pos & 1)
1327    }
1328    const fn idx0(imm8: i32, pos: u32) -> u32 {
1329        let part = if pos < 2 {
1330            imm8 & 0xf
1331        } else {
1332            (imm8 & 0xf0) >> 4
1333        };
1334        if part & 0b1000 != 0 { 4 } else { pos }
1335    }
1336    unsafe {
1337        let r = simd_shuffle!(
1338            a.as_i64x4(),
1339            b.as_i64x4(),
1340            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
1341        );
1342        let r: i64x4 = simd_shuffle!(
1343            r,
1344            i64x4::ZERO,
1345            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
1346        );
1347        r.as_m256i()
1348    }
1349}
1350
1351/// Broadcasts a single-precision (32-bit) floating-point element from memory
1352/// to all elements of the returned vector.
1353///
1354/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
1355#[inline]
1356#[target_feature(enable = "avx")]
1357#[cfg_attr(test, assert_instr(vbroadcastss))]
1358#[stable(feature = "simd_x86", since = "1.27.0")]
1359#[allow(clippy::trivially_copy_pass_by_ref)]
1360#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1361pub const fn _mm256_broadcast_ss(f: &f32) -> __m256 {
1362    _mm256_set1_ps(*f)
1363}
1364
1365/// Broadcasts a single-precision (32-bit) floating-point element from memory
1366/// to all elements of the returned vector.
1367///
1368/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
1369#[inline]
1370#[target_feature(enable = "avx")]
1371#[cfg_attr(test, assert_instr(vbroadcastss))]
1372#[stable(feature = "simd_x86", since = "1.27.0")]
1373#[allow(clippy::trivially_copy_pass_by_ref)]
1374#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1375pub const fn _mm_broadcast_ss(f: &f32) -> __m128 {
1376    _mm_set1_ps(*f)
1377}
1378
1379/// Broadcasts a double-precision (64-bit) floating-point element from memory
1380/// to all elements of the returned vector.
1381///
1382/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
1383#[inline]
1384#[target_feature(enable = "avx")]
1385#[cfg_attr(test, assert_instr(vbroadcastsd))]
1386#[stable(feature = "simd_x86", since = "1.27.0")]
1387#[allow(clippy::trivially_copy_pass_by_ref)]
1388#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1389pub const fn _mm256_broadcast_sd(f: &f64) -> __m256d {
1390    _mm256_set1_pd(*f)
1391}
1392
1393/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
1394/// (32-bit) floating-point elements) to all elements of the returned vector.
1395///
1396/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
1397#[inline]
1398#[target_feature(enable = "avx")]
1399#[cfg_attr(test, assert_instr(vbroadcastf128))]
1400#[stable(feature = "simd_x86", since = "1.27.0")]
1401#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1402pub const fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
1403    unsafe { simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) }
1404}
1405
1406/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
1407/// (64-bit) floating-point elements) to all elements of the returned vector.
1408///
1409/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
1410#[inline]
1411#[target_feature(enable = "avx")]
1412#[cfg_attr(test, assert_instr(vbroadcastf128))]
1413#[stable(feature = "simd_x86", since = "1.27.0")]
1414#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1415pub const fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
1416    unsafe { simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) }
1417}
1418
1419/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
1420/// single-precision (32-bit) floating-point elements) from `b` into result
1421/// at the location specified by `imm8`.
1422///
1423/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
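///
/// # Examples
///
/// A usage sketch (editorial illustration, not from the Intel documentation),
/// assuming an `x86_64` target with AVX detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
///             let b = _mm_setr_ps(8.0, 9.0, 10.0, 11.0);
///             // IMM1 = 1 replaces the upper 128 bits; IMM1 = 0 would replace the lower.
///             let r = _mm256_insertf128_ps::<1>(a, b);
///             let mut out = [0.0f32; 8];
///             _mm256_storeu_ps(out.as_mut_ptr(), r);
///             assert_eq!(out, [0.0, 1.0, 2.0, 3.0, 8.0, 9.0, 10.0, 11.0]);
///         }
///     }
/// }
/// ```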
1424#[inline]
1425#[target_feature(enable = "avx")]
1426#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1427#[rustc_legacy_const_generics(2)]
1428#[stable(feature = "simd_x86", since = "1.27.0")]
1429#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1430pub const fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
1431    static_assert_uimm_bits!(IMM1, 1);
1432    unsafe {
1433        simd_shuffle!(
1434            a,
1435            _mm256_castps128_ps256(b),
1436            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
1437        )
1438    }
1439}
1440
1441/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
1442/// double-precision (64-bit) floating-point elements) from `b` into result
1443/// at the location specified by `imm8`.
1444///
1445/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
1446#[inline]
1447#[target_feature(enable = "avx")]
1448#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1449#[rustc_legacy_const_generics(2)]
1450#[stable(feature = "simd_x86", since = "1.27.0")]
1451#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1452pub const fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
1453    static_assert_uimm_bits!(IMM1, 1);
1454    unsafe {
1455        simd_shuffle!(
1456            a,
1457            _mm256_castpd128_pd256(b),
1458            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1459        )
1460    }
1461}
1462
1463/// Copies `a` to result, then inserts 128 bits from `b` into result
1464/// at the location specified by `imm8`.
1465///
1466/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
1467#[inline]
1468#[target_feature(enable = "avx")]
1469#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1470#[rustc_legacy_const_generics(2)]
1471#[stable(feature = "simd_x86", since = "1.27.0")]
1472#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1473pub const fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1474    static_assert_uimm_bits!(IMM1, 1);
1475    unsafe {
1476        let dst: i64x4 = simd_shuffle!(
1477            a.as_i64x4(),
1478            _mm256_castsi128_si256(b).as_i64x4(),
1479            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1480        );
1481        transmute(dst)
1482    }
1483}
1484
1485/// Copies `a` to result, and inserts the 8-bit integer `i` into result
1486/// at the location specified by `index`.
1487///
1488/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
1489#[inline]
1490#[target_feature(enable = "avx")]
1491// This intrinsic has no corresponding instruction.
1492#[rustc_legacy_const_generics(2)]
1493#[stable(feature = "simd_x86", since = "1.27.0")]
1494#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1495pub const fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
1496    static_assert_uimm_bits!(INDEX, 5);
1497    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
1498}
1499
1500/// Copies `a` to result, and inserts the 16-bit integer `i` into result
1501/// at the location specified by `index`.
1502///
1503/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
1504#[inline]
1505#[target_feature(enable = "avx")]
1506// This intrinsic has no corresponding instruction.
1507#[rustc_legacy_const_generics(2)]
1508#[stable(feature = "simd_x86", since = "1.27.0")]
1509#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1510pub const fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
1511    static_assert_uimm_bits!(INDEX, 4);
1512    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
1513}
1514
1515/// Copies `a` to result, and inserts the 32-bit integer `i` into result
1516/// at the location specified by `index`.
1517///
1518/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
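///
/// # Examples
///
/// A small sketch (editorial illustration, not from the Intel documentation),
/// assuming an `x86_64` target with AVX detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///             // Replace the element at index 7 (the highest lane).
///             let r = _mm256_insert_epi32::<7>(a, 99);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
///             assert_eq!(out, [0, 1, 2, 3, 4, 5, 6, 99]);
///         }
///     }
/// }
/// ```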
1519#[inline]
1520#[target_feature(enable = "avx")]
1521// This intrinsic has no corresponding instruction.
1522#[rustc_legacy_const_generics(2)]
1523#[stable(feature = "simd_x86", since = "1.27.0")]
1524#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1525pub const fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
1526    static_assert_uimm_bits!(INDEX, 3);
1527    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
1528}
1529
1530/// Loads 256 bits (composed of 4 packed double-precision (64-bit)
1531/// floating-point elements) from memory into result.
1532/// `mem_addr` must be aligned on a 32-byte boundary or a
1533/// general-protection exception may be generated.
1534///
1535/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
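///
/// # Examples
///
/// A sketch of how the 32-byte alignment requirement can be met (editorial
/// illustration, not from the Intel documentation; the `Aligned` wrapper type is
/// hypothetical), assuming an `x86_64` target with AVX detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             // A 32-byte aligned buffer; an arbitrary `*const f64` is not enough.
///             #[repr(align(32))]
///             struct Aligned([f64; 4]);
///
///             let data = Aligned([1.0, 2.0, 3.0, 4.0]);
///             let v = _mm256_load_pd(data.0.as_ptr());
///             let mut out = [0.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
///         }
///     }
/// }
/// ```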
1536#[inline]
1537#[target_feature(enable = "avx")]
1538#[cfg_attr(
1539    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1540    assert_instr(vmovap)
1541)]
1542#[stable(feature = "simd_x86", since = "1.27.0")]
1543#[allow(clippy::cast_ptr_alignment)]
1544#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1545pub const unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1546    *(mem_addr as *const __m256d)
1547}
1548
1549/// Stores 256 bits (composed of 4 packed double-precision (64-bit)
1550/// floating-point elements) from `a` into memory.
1551/// `mem_addr` must be aligned on a 32-byte boundary or a
1552/// general-protection exception may be generated.
1553///
1554/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
1555#[inline]
1556#[target_feature(enable = "avx")]
1557#[cfg_attr(
1558    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1559    assert_instr(vmovap)
1560)]
1561#[stable(feature = "simd_x86", since = "1.27.0")]
1562#[allow(clippy::cast_ptr_alignment)]
1563#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1564pub const unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
1565    *(mem_addr as *mut __m256d) = a;
1566}
1567
1568/// Loads 256 bits (composed of 8 packed single-precision (32-bit)
1569/// floating-point elements) from memory into result.
1570/// `mem_addr` must be aligned on a 32-byte boundary or a
1571/// general-protection exception may be generated.
1572///
1573/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
1574#[inline]
1575#[target_feature(enable = "avx")]
1576#[cfg_attr(
1577    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1578    assert_instr(vmovaps)
1579)]
1580#[stable(feature = "simd_x86", since = "1.27.0")]
1581#[allow(clippy::cast_ptr_alignment)]
1582#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1583pub const unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1584    *(mem_addr as *const __m256)
1585}
1586
1587/// Stores 256 bits (composed of 8 packed single-precision (32-bit)
1588/// floating-point elements) from `a` into memory.
1589/// `mem_addr` must be aligned on a 32-byte boundary or a
1590/// general-protection exception may be generated.
1591///
1592/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
1593#[inline]
1594#[target_feature(enable = "avx")]
1595#[cfg_attr(
1596    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1597    assert_instr(vmovaps)
1598)]
1599#[stable(feature = "simd_x86", since = "1.27.0")]
1600#[allow(clippy::cast_ptr_alignment)]
1601#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1602pub const unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
1603    *(mem_addr as *mut __m256) = a;
1604}
1605
1606/// Loads 256 bits (composed of 4 packed double-precision (64-bit)
1607/// floating-point elements) from memory into result.
1608/// `mem_addr` does not need to be aligned on any particular boundary.
1609///
1610/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
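///
/// # Examples
///
/// A usage sketch (editorial illustration, not from the Intel documentation),
/// assuming an `x86_64` target with AVX detected at runtime; because no alignment
/// is required, loading from an arbitrary offset is fine:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let data: Vec<f64> = (0..8).map(f64::from).collect();
///             let v = _mm256_loadu_pd(data[3..].as_ptr());
///             let mut out = [0.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             assert_eq!(out, [3.0, 4.0, 5.0, 6.0]);
///         }
///     }
/// }
/// ```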
1611#[inline]
1612#[target_feature(enable = "avx")]
1613#[cfg_attr(test, assert_instr(vmovup))]
1614#[stable(feature = "simd_x86", since = "1.27.0")]
1615#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1616pub const unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1617    let mut dst = _mm256_undefined_pd();
1618    ptr::copy_nonoverlapping(
1619        mem_addr as *const u8,
1620        ptr::addr_of_mut!(dst) as *mut u8,
1621        mem::size_of::<__m256d>(),
1622    );
1623    dst
1624}
1625
1626/// Stores 256 bits (composed of 4 packed double-precision (64-bit)
1627/// floating-point elements) from `a` into memory.
1628/// `mem_addr` does not need to be aligned on any particular boundary.
1629///
1630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
1631#[inline]
1632#[target_feature(enable = "avx")]
1633#[cfg_attr(test, assert_instr(vmovup))]
1634#[stable(feature = "simd_x86", since = "1.27.0")]
1635#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1636pub const unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
1637    mem_addr.cast::<__m256d>().write_unaligned(a);
1638}
1639
1640/// Loads 256 bits (composed of 8 packed single-precision (32-bit)
1641/// floating-point elements) from memory into result.
1642/// `mem_addr` does not need to be aligned on any particular boundary.
1643///
1644/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
1645#[inline]
1646#[target_feature(enable = "avx")]
1647#[cfg_attr(test, assert_instr(vmovups))]
1648#[stable(feature = "simd_x86", since = "1.27.0")]
1649#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1650pub const unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1651    let mut dst = _mm256_undefined_ps();
1652    ptr::copy_nonoverlapping(
1653        mem_addr as *const u8,
1654        ptr::addr_of_mut!(dst) as *mut u8,
1655        mem::size_of::<__m256>(),
1656    );
1657    dst
1658}
1659
1660/// Stores 256 bits (composed of 8 packed single-precision (32-bit)
1661/// floating-point elements) from `a` into memory.
1662/// `mem_addr` does not need to be aligned on any particular boundary.
1663///
1664/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
1665#[inline]
1666#[target_feature(enable = "avx")]
1667#[cfg_attr(test, assert_instr(vmovups))]
1668#[stable(feature = "simd_x86", since = "1.27.0")]
1669#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1670pub const unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
1671    mem_addr.cast::<__m256>().write_unaligned(a);
1672}
1673
1674/// Loads 256 bits of integer data from memory into result.
1675/// `mem_addr` must be aligned on a 32-byte boundary or a
1676/// general-protection exception may be generated.
1677///
1678/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
1679#[inline]
1680#[target_feature(enable = "avx")]
1681#[cfg_attr(
1682    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1683    assert_instr(vmovaps)
1684)] // FIXME vmovdqa expected
1685#[stable(feature = "simd_x86", since = "1.27.0")]
1686#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1687pub const unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
1688    *mem_addr
1689}
1690
1691/// Stores 256 bits of integer data from `a` into memory.
1692/// `mem_addr` must be aligned on a 32-byte boundary or a
1693/// general-protection exception may be generated.
1694///
1695/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
1696#[inline]
1697#[target_feature(enable = "avx")]
1698#[cfg_attr(
1699    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1700    assert_instr(vmovaps)
1701)] // FIXME vmovdqa expected
1702#[stable(feature = "simd_x86", since = "1.27.0")]
1703#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1704pub const unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
1705    *mem_addr = a;
1706}
1707
1708/// Loads 256 bits of integer data from memory into result.
1709/// `mem_addr` does not need to be aligned on any particular boundary.
1710///
1711/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
1712#[inline]
1713#[target_feature(enable = "avx")]
1714#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1715#[stable(feature = "simd_x86", since = "1.27.0")]
1716#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1717pub const unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1718    let mut dst = _mm256_undefined_si256();
1719    ptr::copy_nonoverlapping(
1720        mem_addr as *const u8,
1721        ptr::addr_of_mut!(dst) as *mut u8,
1722        mem::size_of::<__m256i>(),
1723    );
1724    dst
1725}
1726
1727/// Stores 256 bits of integer data from `a` into memory.
1728/// `mem_addr` does not need to be aligned on any particular boundary.
1729///
1730/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
1731#[inline]
1732#[target_feature(enable = "avx")]
1733#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1734#[stable(feature = "simd_x86", since = "1.27.0")]
1735#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1736pub const unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
1737    mem_addr.write_unaligned(a);
1738}
1739
1740/// Loads packed double-precision (64-bit) floating-point elements from memory
1741/// into result using `mask` (elements are zeroed out when the high bit of the
1742/// corresponding element is not set).
1743///
1744/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
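///
/// # Examples
///
/// A sketch of the masking behaviour (editorial illustration, not from the Intel
/// documentation), assuming an `x86_64` target with AVX detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let data = [1.0, 2.0, 3.0, 4.0];
///             // Only elements whose mask value has the high (sign) bit set are loaded.
///             let mask = _mm256_setr_epi64x(-1, 0, -1, 0);
///             let r = _mm256_maskload_pd(data.as_ptr(), mask);
///             let mut out = [0.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), r);
///             assert_eq!(out, [1.0, 0.0, 3.0, 0.0]);
///         }
///     }
/// }
/// ```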
1745#[inline]
1746#[target_feature(enable = "avx")]
1747#[cfg_attr(test, assert_instr(vmaskmovpd))]
1748#[stable(feature = "simd_x86", since = "1.27.0")]
1749#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1750pub const unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
1751    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1752    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_pd())
1753}
1754
1755/// Stores packed double-precision (64-bit) floating-point elements from `a`
1756/// into memory using `mask`.
1757///
1758/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
1759#[inline]
1760#[target_feature(enable = "avx")]
1761#[cfg_attr(test, assert_instr(vmaskmovpd))]
1762#[stable(feature = "simd_x86", since = "1.27.0")]
1763#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1764pub const unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
1765    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1766    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1767}
1768
1769/// Loads packed double-precision (64-bit) floating-point elements from memory
1770/// into result using `mask` (elements are zeroed out when the high bit of the
1771/// corresponding element is not set).
1772///
1773/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
1774#[inline]
1775#[target_feature(enable = "avx")]
1776#[cfg_attr(test, assert_instr(vmaskmovpd))]
1777#[stable(feature = "simd_x86", since = "1.27.0")]
1778#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1779pub const unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1780    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1781    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd())
1782}
1783
1784/// Stores packed double-precision (64-bit) floating-point elements from `a`
1785/// into memory using `mask`.
1786///
1787/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
1788#[inline]
1789#[target_feature(enable = "avx")]
1790#[cfg_attr(test, assert_instr(vmaskmovpd))]
1791#[stable(feature = "simd_x86", since = "1.27.0")]
1792#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1793pub const unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1794    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1795    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1796}
1797
1798/// Loads packed single-precision (32-bit) floating-point elements from memory
1799/// into result using `mask` (elements are zeroed out when the high bit of the
1800/// corresponding element is not set).
1801///
1802/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
1803#[inline]
1804#[target_feature(enable = "avx")]
1805#[cfg_attr(test, assert_instr(vmaskmovps))]
1806#[stable(feature = "simd_x86", since = "1.27.0")]
1807#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1808pub const unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
1809    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1810    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps())
1811}
1812
1813/// Stores packed single-precision (32-bit) floating-point elements from `a`
1814/// into memory using `mask`.
1815///
1816/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
1817#[inline]
1818#[target_feature(enable = "avx")]
1819#[cfg_attr(test, assert_instr(vmaskmovps))]
1820#[stable(feature = "simd_x86", since = "1.27.0")]
1821#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1822pub const unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
1823    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1824    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1825}
1826
1827/// Loads packed single-precision (32-bit) floating-point elements from memory
1828/// into result using `mask` (elements are zeroed out when the high bit of the
1829/// corresponding element is not set).
1830///
1831/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
1832#[inline]
1833#[target_feature(enable = "avx")]
1834#[cfg_attr(test, assert_instr(vmaskmovps))]
1835#[stable(feature = "simd_x86", since = "1.27.0")]
1836#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1837pub const unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1838    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1839    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps())
1840}
1841
1842/// Stores packed single-precision (32-bit) floating-point elements from `a`
1843/// into memory using `mask`.
1844///
1845/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
1846#[inline]
1847#[target_feature(enable = "avx")]
1848#[cfg_attr(test, assert_instr(vmaskmovps))]
1849#[stable(feature = "simd_x86", since = "1.27.0")]
1850#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1851pub const unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1852    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1853    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
1854}
1855
1856/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
1857/// from `a`, and returns the results.
1858///
1859/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
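///
/// # Examples
///
/// A small sketch (editorial illustration, not from the Intel documentation),
/// assuming an `x86_64` target with AVX detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
///             let r = _mm256_movehdup_ps(a);
///             let mut out = [0.0f32; 8];
///             _mm256_storeu_ps(out.as_mut_ptr(), r);
///             // Each odd-indexed element is copied into the even slot below it.
///             assert_eq!(out, [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0]);
///         }
///     }
/// }
/// ```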
1860#[inline]
1861#[target_feature(enable = "avx")]
1862#[cfg_attr(test, assert_instr(vmovshdup))]
1863#[stable(feature = "simd_x86", since = "1.27.0")]
1864#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1865pub const fn _mm256_movehdup_ps(a: __m256) -> __m256 {
1866    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
1867}
1868
1869/// Duplicates even-indexed single-precision (32-bit) floating-point elements
1870/// from `a`, and returns the results.
1871///
1872/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
1873#[inline]
1874#[target_feature(enable = "avx")]
1875#[cfg_attr(test, assert_instr(vmovsldup))]
1876#[stable(feature = "simd_x86", since = "1.27.0")]
1877#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1878pub const fn _mm256_moveldup_ps(a: __m256) -> __m256 {
1879    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
1880}
1881
1882/// Duplicates even-indexed double-precision (64-bit) floating-point elements
1883/// from `a`, and returns the results.
1884///
1885/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
1886#[inline]
1887#[target_feature(enable = "avx")]
1888#[cfg_attr(test, assert_instr(vmovddup))]
1889#[stable(feature = "simd_x86", since = "1.27.0")]
1890#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1891pub const fn _mm256_movedup_pd(a: __m256d) -> __m256d {
1892    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
1893}
1894
1895/// Loads 256 bits of integer data from unaligned memory into result.
1896/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1897/// data crosses a cache line boundary.
1898///
1899/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
1900#[inline]
1901#[target_feature(enable = "avx")]
1902#[cfg_attr(test, assert_instr(vlddqu))]
1903#[stable(feature = "simd_x86", since = "1.27.0")]
1904pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
1905    transmute(vlddqu(mem_addr as *const i8))
1906}
1907
1908/// Moves integer data from a 256-bit integer vector to a 32-byte
1909/// aligned memory location. To minimize caching, the data is flagged as
1910/// non-temporal (unlikely to be used again soon).
1911///
1912/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
1913///
1914/// # Safety of non-temporal stores
1915///
1916/// After using this intrinsic, but before any other access to the memory that this intrinsic
1917/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1918/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1919/// return.
1920///
1921/// See [`_mm_sfence`] for details.
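///
/// # Examples
///
/// A sketch of the expected store/fence pairing (editorial illustration, not from
/// the Intel documentation; the `Aligned` wrapper is hypothetical), assuming an
/// `x86_64` target with AVX detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             // The destination must be 32-byte aligned.
///             #[repr(align(32))]
///             struct Aligned([i64; 4]);
///
///             let mut dst = Aligned([0; 4]);
///             let v = _mm256_setr_epi64x(1, 2, 3, 4);
///             _mm256_stream_si256(dst.0.as_mut_ptr() as *mut __m256i, v);
///             // Fence before the streamed memory is accessed again.
///             _mm_sfence();
///             assert_eq!(dst.0, [1, 2, 3, 4]);
///         }
///     }
/// }
/// ```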
1922#[inline]
1923#[target_feature(enable = "avx")]
1924#[cfg_attr(test, assert_instr(vmovntdq))]
1925#[stable(feature = "simd_x86", since = "1.27.0")]
1926pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1927    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
1928    crate::arch::asm!(
1929        vps!("vmovntdq", ",{a}"),
1930        p = in(reg) mem_addr,
1931        a = in(ymm_reg) a,
1932        options(nostack, preserves_flags),
1933    );
1934}
1935
1936/// Moves double-precision values from a 256-bit vector of `[4 x double]`
1937/// to a 32-byte aligned memory location. To minimize caching, the data is
1938/// flagged as non-temporal (unlikely to be used again soon).
1939///
1940/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
1941///
1942/// # Safety of non-temporal stores
1943///
1944/// After using this intrinsic, but before any other access to the memory that this intrinsic
1945/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1946/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1947/// return.
1948///
1949/// See [`_mm_sfence`] for details.
1950#[inline]
1951#[target_feature(enable = "avx")]
1952#[cfg_attr(test, assert_instr(vmovntpd))]
1953#[stable(feature = "simd_x86", since = "1.27.0")]
1954#[allow(clippy::cast_ptr_alignment)]
1955pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1956    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
1957    crate::arch::asm!(
1958        vps!("vmovntpd", ",{a}"),
1959        p = in(reg) mem_addr,
1960        a = in(ymm_reg) a,
1961        options(nostack, preserves_flags),
1962    );
1963}
1964
1965/// Moves single-precision floating point values from a 256-bit vector
1966/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
1967/// caching, the data is flagged as non-temporal (unlikely to be used again
1968/// soon).
1969///
1970/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
1971///
1972/// # Safety of non-temporal stores
1973///
1974/// After using this intrinsic, but before any other access to the memory that this intrinsic
1975/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1976/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1977/// return.
1978///
1979/// See [`_mm_sfence`] for details.
1980#[inline]
1981#[target_feature(enable = "avx")]
1982#[cfg_attr(test, assert_instr(vmovntps))]
1983#[stable(feature = "simd_x86", since = "1.27.0")]
1984#[allow(clippy::cast_ptr_alignment)]
1985pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1986    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
1987    crate::arch::asm!(
1988        vps!("vmovntps", ",{a}"),
1989        p = in(reg) mem_addr,
1990        a = in(ymm_reg) a,
1991        options(nostack, preserves_flags),
1992    );
1993}
1994
1995/// Computes the approximate reciprocal of packed single-precision (32-bit)
1996/// floating-point elements in `a`, and returns the results. The maximum
1997/// relative error for this approximation is less than 1.5*2^-12.
1998///
1999/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
2000#[inline]
2001#[target_feature(enable = "avx")]
2002#[cfg_attr(test, assert_instr(vrcpps))]
2003#[stable(feature = "simd_x86", since = "1.27.0")]
2004pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
2005    unsafe { vrcpps(a) }
2006}
2007
2008/// Computes the approximate reciprocal square root of packed single-precision
2009/// (32-bit) floating-point elements in `a`, and returns the results.
2010/// The maximum relative error for this approximation is less than 1.5*2^-12.
2011///
2012/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
2013#[inline]
2014#[target_feature(enable = "avx")]
2015#[cfg_attr(test, assert_instr(vrsqrtps))]
2016#[stable(feature = "simd_x86", since = "1.27.0")]
2017pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
2018    unsafe { vrsqrtps(a) }
2019}
2020
2021/// Unpacks and interleaves double-precision (64-bit) floating-point elements
2022/// from the high half of each 128-bit lane in `a` and `b`.
2023///
2024/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
2025#[inline]
2026#[target_feature(enable = "avx")]
2027#[cfg_attr(test, assert_instr(vunpckhpd))]
2028#[stable(feature = "simd_x86", since = "1.27.0")]
2029#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2030pub const fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
2031    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
2032}
2033
2034/// Unpacks and interleaves single-precision (32-bit) floating-point elements
2035/// from the high half of each 128-bit lane in `a` and `b`.
2036///
2037/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
2038#[inline]
2039#[target_feature(enable = "avx")]
2040#[cfg_attr(test, assert_instr(vunpckhps))]
2041#[stable(feature = "simd_x86", since = "1.27.0")]
2042#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2043pub const fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
2044    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
2045}
2046
2047/// Unpacks and interleaves double-precision (64-bit) floating-point elements
2048/// from the low half of each 128-bit lane in `a` and `b`.
2049///
2050/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
2051#[inline]
2052#[target_feature(enable = "avx")]
2053#[cfg_attr(test, assert_instr(vunpcklpd))]
2054#[stable(feature = "simd_x86", since = "1.27.0")]
2055#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2056pub const fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
2057    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
2058}
2059
2060/// Unpacks and interleaves single-precision (32-bit) floating-point elements
2061/// from the low half of each 128-bit lane in `a` and `b`.
2062///
2063/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
2064#[inline]
2065#[target_feature(enable = "avx")]
2066#[cfg_attr(test, assert_instr(vunpcklps))]
2067#[stable(feature = "simd_x86", since = "1.27.0")]
2068#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2069pub const fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
2070    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
2071}
2072
2073/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
2074/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
2075/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
2076/// the result is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2077///
2078/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
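///
/// # Examples
///
/// A small sketch (editorial illustration, not from the Intel documentation),
/// assuming an `x86_64` target with AVX detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setr_epi64x(0b0011, 0, 0, 0);
///             let b = _mm256_setr_epi64x(0b1100, 0, 0, 0);
///             // `a & b` is all zero, so `ZF` (the return value) is 1.
///             assert_eq!(_mm256_testz_si256(a, b), 1);
///             // `a & a` is non-zero, so the result is 0.
///             assert_eq!(_mm256_testz_si256(a, a), 0);
///         }
///     }
/// }
/// ```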
2079#[inline]
2080#[target_feature(enable = "avx")]
2081#[cfg_attr(test, assert_instr(vptest))]
2082#[stable(feature = "simd_x86", since = "1.27.0")]
2083#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2084pub const fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
2085    unsafe {
2086        let r = simd_and(a.as_i64x4(), b.as_i64x4());
2087        (0i64 == simd_reduce_or(r)) as i32
2088    }
2089}
2090
2091/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
2092/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
2093/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
2094/// the result is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2095///
2096/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
2097#[inline]
2098#[target_feature(enable = "avx")]
2099#[cfg_attr(test, assert_instr(vptest))]
2100#[stable(feature = "simd_x86", since = "1.27.0")]
2101#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2102pub const fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
2103    unsafe {
2104        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
2105        (0i64 == simd_reduce_or(r)) as i32
2106    }
2107}
2108
2109/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
2110/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
2111/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
2112/// the result is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
2113/// `CF` values are zero, otherwise returns 0.
2114///
2115/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
2116#[inline]
2117#[target_feature(enable = "avx")]
2118#[cfg_attr(test, assert_instr(vptest))]
2119#[stable(feature = "simd_x86", since = "1.27.0")]
2120pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
2121    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
2122}
2123
2124/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2125/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2126/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2127/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2128/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2129/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2130/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2131///
2132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
2133#[inline]
2134#[target_feature(enable = "avx")]
2135#[cfg_attr(test, assert_instr(vtestpd))]
2136#[stable(feature = "simd_x86", since = "1.27.0")]
2137pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
2138    unsafe { vtestzpd256(a, b) }
2139}
2140
2141/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2142/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2143/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2144/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2145/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2146/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2147/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2148///
2149/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
2150#[inline]
2151#[target_feature(enable = "avx")]
2152#[cfg_attr(test, assert_instr(vtestpd))]
2153#[stable(feature = "simd_x86", since = "1.27.0")]
2154pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
2155    unsafe { vtestcpd256(a, b) }
2156}
2157
2158/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2159/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2160/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2161/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2162/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2163/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2164/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2165/// are zero, otherwise returns 0.
2166///
2167/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
2168#[inline]
2169#[target_feature(enable = "avx")]
2170#[cfg_attr(test, assert_instr(vtestpd))]
2171#[stable(feature = "simd_x86", since = "1.27.0")]
2172pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
2173    unsafe { vtestnzcpd256(a, b) }
2174}
2175
2176/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2177/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2178/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2179/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2180/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2181/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2182/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2183///
2184/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
2185#[inline]
2186#[target_feature(enable = "avx")]
2187#[cfg_attr(test, assert_instr(vtestpd))]
2188#[stable(feature = "simd_x86", since = "1.27.0")]
2189#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2190pub const fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
2191    unsafe {
2192        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
2193        (0i64 == simd_reduce_or(r)) as i32
2194    }
2195}
2196
2197/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2198/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2199/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2200/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2201/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2202/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2203/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2204///
2205/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
2206#[inline]
2207#[target_feature(enable = "avx")]
2208#[cfg_attr(test, assert_instr(vtestpd))]
2209#[stable(feature = "simd_x86", since = "1.27.0")]
2210#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2211pub const fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
2212    unsafe {
2213        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
2214        (0i64 == simd_reduce_or(r)) as i32
2215    }
2216}
2217
2218/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2219/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2220/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2221/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2222/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2223/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2224/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2225/// are zero, otherwise returns 0.
2226///
2227/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
2228#[inline]
2229#[target_feature(enable = "avx")]
2230#[cfg_attr(test, assert_instr(vtestpd))]
2231#[stable(feature = "simd_x86", since = "1.27.0")]
2232pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
2233    unsafe { vtestnzcpd(a, b) }
2234}
2235
2236/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2237/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2238/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2239/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2240/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2241/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2242/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2243///
2244/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
2245#[inline]
2246#[target_feature(enable = "avx")]
2247#[cfg_attr(test, assert_instr(vtestps))]
2248#[stable(feature = "simd_x86", since = "1.27.0")]
2249pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
2250    unsafe { vtestzps256(a, b) }
2251}
2252
2253/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2254/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2255/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2256/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2257/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2258/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2259/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2260///
2261/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
2262#[inline]
2263#[target_feature(enable = "avx")]
2264#[cfg_attr(test, assert_instr(vtestps))]
2265#[stable(feature = "simd_x86", since = "1.27.0")]
2266pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
2267    unsafe { vtestcps256(a, b) }
2268}
2269
2270/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2271/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2272/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2273/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2274/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2275/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2276/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2277/// are zero, otherwise returns 0.
2278///
2279/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
2280#[inline]
2281#[target_feature(enable = "avx")]
2282#[cfg_attr(test, assert_instr(vtestps))]
2283#[stable(feature = "simd_x86", since = "1.27.0")]
2284pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
2285    unsafe { vtestnzcps256(a, b) }
2286}
2287
2288/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2289/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2290/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2291/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2292/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2293/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2294/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2295///
2296/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
2297#[inline]
2298#[target_feature(enable = "avx")]
2299#[cfg_attr(test, assert_instr(vtestps))]
2300#[stable(feature = "simd_x86", since = "1.27.0")]
2301#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2302pub const fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2303    unsafe {
2304        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
2305        (0i32 == simd_reduce_or(r)) as i32
2306    }
2307}
2308
2309/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2310/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2311/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2312/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2313/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2314/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2315/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2316///
2317/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
2318#[inline]
2319#[target_feature(enable = "avx")]
2320#[cfg_attr(test, assert_instr(vtestps))]
2321#[stable(feature = "simd_x86", since = "1.27.0")]
2322#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2323pub const fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2324    unsafe {
2325        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
2326        (0i32 == simd_reduce_or(r)) as i32
2327    }
2328}
2329
2330/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2331/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2332/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2333/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2334/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2335/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2336/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2337/// are zero, otherwise returns 0.
2338///
2339/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
2340#[inline]
2341#[target_feature(enable = "avx")]
2342#[cfg_attr(test, assert_instr(vtestps))]
2343#[stable(feature = "simd_x86", since = "1.27.0")]
2344pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
2345    unsafe { vtestnzcps(a, b) }
2346}
2347
2348/// Sets each bit of the returned mask based on the most significant bit of the
2349/// corresponding packed double-precision (64-bit) floating-point element in
2350/// `a`.
2351///
2352/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
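///
/// # Examples
///
/// A small sketch (editorial illustration, not from the Intel documentation),
/// assuming an `x86_64` target with AVX detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setr_pd(-1.0, 2.0, -3.0, 4.0);
///             // Elements 0 and 2 are negative, so bits 0 and 2 of the mask are set.
///             assert_eq!(_mm256_movemask_pd(a), 0b0101);
///         }
///     }
/// }
/// ```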
2353#[inline]
2354#[target_feature(enable = "avx")]
2355#[cfg_attr(test, assert_instr(vmovmskpd))]
2356#[stable(feature = "simd_x86", since = "1.27.0")]
2357#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2358pub const fn _mm256_movemask_pd(a: __m256d) -> i32 {
2359    // Propagate the highest bit to the rest, because simd_bitmask
2360    // requires all-1 or all-0.
2361    unsafe {
2362        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
2363        simd_bitmask::<i64x4, u8>(mask) as i32
2364    }
2365}
2366
2367/// Sets each bit of the returned mask based on the most significant bit of the
2368/// corresponding packed single-precision (32-bit) floating-point element in
2369/// `a`.
2370///
2371/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
2372#[inline]
2373#[target_feature(enable = "avx")]
2374#[cfg_attr(test, assert_instr(vmovmskps))]
2375#[stable(feature = "simd_x86", since = "1.27.0")]
2376#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2377pub const fn _mm256_movemask_ps(a: __m256) -> i32 {
2378    // Propagate the highest bit to the rest, because simd_bitmask
2379    // requires all-1 or all-0.
2380    unsafe {
2381        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
2382        simd_bitmask::<i32x8, u8>(mask) as i32
2383    }
2384}
2385
2386/// Returns a vector of type `__m256d` with all elements set to zero.
2387///
2388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
2389#[inline]
2390#[target_feature(enable = "avx")]
2391#[cfg_attr(test, assert_instr(vxorp))]
2392#[stable(feature = "simd_x86", since = "1.27.0")]
2393#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2394pub const fn _mm256_setzero_pd() -> __m256d {
2395    const { unsafe { mem::zeroed() } }
2396}
2397
2398/// Returns a vector of type `__m256` with all elements set to zero.
2399///
2400/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
2401#[inline]
2402#[target_feature(enable = "avx")]
2403#[cfg_attr(test, assert_instr(vxorps))]
2404#[stable(feature = "simd_x86", since = "1.27.0")]
2405#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2406pub const fn _mm256_setzero_ps() -> __m256 {
2407    const { unsafe { mem::zeroed() } }
2408}
2409
2410/// Returns a vector of type `__m256i` with all elements set to zero.
2411///
2412/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
2413#[inline]
2414#[target_feature(enable = "avx")]
2415#[cfg_attr(test, assert_instr(vxor))]
2416#[stable(feature = "simd_x86", since = "1.27.0")]
2417#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2418pub const fn _mm256_setzero_si256() -> __m256i {
2419    const { unsafe { mem::zeroed() } }
2420}
2421
2422/// Sets packed double-precision (64-bit) floating-point elements in the returned
2423/// vector with the supplied values.
2424///
2425/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
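///
/// # Examples
///
/// A sketch of the argument ordering (editorial illustration, not from the Intel
/// documentation), assuming an `x86_64` target with AVX detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
///             let mut out = [0.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             // The first argument lands in the highest-indexed element;
///             // `_mm256_setr_pd` takes the arguments in memory order instead.
///             assert_eq!(out, [0.0, 1.0, 2.0, 3.0]);
///         }
///     }
/// }
/// ```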
2426#[inline]
2427#[target_feature(enable = "avx")]
2428// This intrinsic has no corresponding instruction.
2429#[cfg_attr(test, assert_instr(vinsertf128))]
2430#[stable(feature = "simd_x86", since = "1.27.0")]
2431#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2432pub const fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2433    _mm256_setr_pd(d, c, b, a)
2434}
2435
2436/// Sets packed single-precision (32-bit) floating-point elements in the returned
2437/// vector with the supplied values.
2438///
2439/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2440#[inline]
2441#[target_feature(enable = "avx")]
2442// This intrinsic has no corresponding instruction.
2443#[stable(feature = "simd_x86", since = "1.27.0")]
2444#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2445pub const fn _mm256_set_ps(
2446    a: f32,
2447    b: f32,
2448    c: f32,
2449    d: f32,
2450    e: f32,
2451    f: f32,
2452    g: f32,
2453    h: f32,
2454) -> __m256 {
2455    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2456}
2457
2458/// Sets packed 8-bit integers in the returned vector with the supplied values.
2459///
2460/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
2461#[inline]
2462#[target_feature(enable = "avx")]
2463// This intrinsic has no corresponding instruction.
2464#[stable(feature = "simd_x86", since = "1.27.0")]
2465#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2466pub const fn _mm256_set_epi8(
2467    e00: i8,
2468    e01: i8,
2469    e02: i8,
2470    e03: i8,
2471    e04: i8,
2472    e05: i8,
2473    e06: i8,
2474    e07: i8,
2475    e08: i8,
2476    e09: i8,
2477    e10: i8,
2478    e11: i8,
2479    e12: i8,
2480    e13: i8,
2481    e14: i8,
2482    e15: i8,
2483    e16: i8,
2484    e17: i8,
2485    e18: i8,
2486    e19: i8,
2487    e20: i8,
2488    e21: i8,
2489    e22: i8,
2490    e23: i8,
2491    e24: i8,
2492    e25: i8,
2493    e26: i8,
2494    e27: i8,
2495    e28: i8,
2496    e29: i8,
2497    e30: i8,
2498    e31: i8,
2499) -> __m256i {
2500    #[rustfmt::skip]
2501    _mm256_setr_epi8(
2502        e31, e30, e29, e28, e27, e26, e25, e24,
2503        e23, e22, e21, e20, e19, e18, e17, e16,
2504        e15, e14, e13, e12, e11, e10, e09, e08,
2505        e07, e06, e05, e04, e03, e02, e01, e00,
2506    )
2507}
2508
2509/// Sets packed 16-bit integers in returned vector with the supplied values.
2510///
2511/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
2512#[inline]
2513#[target_feature(enable = "avx")]
2514// This intrinsic has no corresponding instruction.
2515#[stable(feature = "simd_x86", since = "1.27.0")]
2516#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2517pub const fn _mm256_set_epi16(
2518    e00: i16,
2519    e01: i16,
2520    e02: i16,
2521    e03: i16,
2522    e04: i16,
2523    e05: i16,
2524    e06: i16,
2525    e07: i16,
2526    e08: i16,
2527    e09: i16,
2528    e10: i16,
2529    e11: i16,
2530    e12: i16,
2531    e13: i16,
2532    e14: i16,
2533    e15: i16,
2534) -> __m256i {
2535    #[rustfmt::skip]
2536    _mm256_setr_epi16(
2537        e15, e14, e13, e12,
2538        e11, e10, e09, e08,
2539        e07, e06, e05, e04,
2540        e03, e02, e01, e00,
2541    )
2542}
2543
2544/// Sets packed 32-bit integers in returned vector with the supplied values.
2545///
2546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2547#[inline]
2548#[target_feature(enable = "avx")]
2549// This intrinsic has no corresponding instruction.
2550#[stable(feature = "simd_x86", since = "1.27.0")]
2551#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2552pub const fn _mm256_set_epi32(
2553    e0: i32,
2554    e1: i32,
2555    e2: i32,
2556    e3: i32,
2557    e4: i32,
2558    e5: i32,
2559    e6: i32,
2560    e7: i32,
2561) -> __m256i {
2562    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2563}
2564
2565/// Sets packed 64-bit integers in returned vector with the supplied values.
2566///
2567/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2568#[inline]
2569#[target_feature(enable = "avx")]
2570// This intrinsic has no corresponding instruction.
2571#[stable(feature = "simd_x86", since = "1.27.0")]
2572#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2573pub const fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2574    _mm256_setr_epi64x(d, c, b, a)
2575}
2576
2577/// Sets packed double-precision (64-bit) floating-point elements in returned
2578/// vector with the supplied values in reverse order.
2579///
2580/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
2581#[inline]
2582#[target_feature(enable = "avx")]
2583// This intrinsic has no corresponding instruction.
2584#[stable(feature = "simd_x86", since = "1.27.0")]
2585#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2586pub const fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2587    __m256d([a, b, c, d])
2588}
2589
2590/// Sets packed single-precision (32-bit) floating-point elements in returned
2591/// vector with the supplied values in reverse order.
2592///
2593/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
2594#[inline]
2595#[target_feature(enable = "avx")]
2596// This intrinsic has no corresponding instruction.
2597#[stable(feature = "simd_x86", since = "1.27.0")]
2598#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2599pub const fn _mm256_setr_ps(
2600    a: f32,
2601    b: f32,
2602    c: f32,
2603    d: f32,
2604    e: f32,
2605    f: f32,
2606    g: f32,
2607    h: f32,
2608) -> __m256 {
2609    __m256([a, b, c, d, e, f, g, h])
2610}
2611
2612/// Sets packed 8-bit integers in returned vector with the supplied values in
2613/// reverse order.
2614///
2615/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
2616#[inline]
2617#[target_feature(enable = "avx")]
2618// This intrinsic has no corresponding instruction.
2619#[stable(feature = "simd_x86", since = "1.27.0")]
2620#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2621pub const fn _mm256_setr_epi8(
2622    e00: i8,
2623    e01: i8,
2624    e02: i8,
2625    e03: i8,
2626    e04: i8,
2627    e05: i8,
2628    e06: i8,
2629    e07: i8,
2630    e08: i8,
2631    e09: i8,
2632    e10: i8,
2633    e11: i8,
2634    e12: i8,
2635    e13: i8,
2636    e14: i8,
2637    e15: i8,
2638    e16: i8,
2639    e17: i8,
2640    e18: i8,
2641    e19: i8,
2642    e20: i8,
2643    e21: i8,
2644    e22: i8,
2645    e23: i8,
2646    e24: i8,
2647    e25: i8,
2648    e26: i8,
2649    e27: i8,
2650    e28: i8,
2651    e29: i8,
2652    e30: i8,
2653    e31: i8,
2654) -> __m256i {
2655    unsafe {
2656        #[rustfmt::skip]
2657        transmute(i8x32::new(
2658            e00, e01, e02, e03, e04, e05, e06, e07,
2659            e08, e09, e10, e11, e12, e13, e14, e15,
2660            e16, e17, e18, e19, e20, e21, e22, e23,
2661            e24, e25, e26, e27, e28, e29, e30, e31,
2662        ))
2663    }
2664}
2665
2666/// Sets packed 16-bit integers in returned vector with the supplied values in
2667/// reverse order.
2668///
2669/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2670#[inline]
2671#[target_feature(enable = "avx")]
2672// This intrinsic has no corresponding instruction.
2673#[stable(feature = "simd_x86", since = "1.27.0")]
2674#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2675pub const fn _mm256_setr_epi16(
2676    e00: i16,
2677    e01: i16,
2678    e02: i16,
2679    e03: i16,
2680    e04: i16,
2681    e05: i16,
2682    e06: i16,
2683    e07: i16,
2684    e08: i16,
2685    e09: i16,
2686    e10: i16,
2687    e11: i16,
2688    e12: i16,
2689    e13: i16,
2690    e14: i16,
2691    e15: i16,
2692) -> __m256i {
2693    unsafe {
2694        #[rustfmt::skip]
2695        transmute(i16x16::new(
2696            e00, e01, e02, e03,
2697            e04, e05, e06, e07,
2698            e08, e09, e10, e11,
2699            e12, e13, e14, e15,
2700        ))
2701    }
2702}
2703
2704/// Sets packed 32-bit integers in returned vector with the supplied values in
2705/// reverse order.
2706///
2707/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2708#[inline]
2709#[target_feature(enable = "avx")]
2710// This intrinsic has no corresponding instruction.
2711#[stable(feature = "simd_x86", since = "1.27.0")]
2712#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2713pub const fn _mm256_setr_epi32(
2714    e0: i32,
2715    e1: i32,
2716    e2: i32,
2717    e3: i32,
2718    e4: i32,
2719    e5: i32,
2720    e6: i32,
2721    e7: i32,
2722) -> __m256i {
2723    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
2724}
2725
2726/// Sets packed 64-bit integers in returned vector with the supplied values in
2727/// reverse order.
2728///
2729/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2730#[inline]
2731#[target_feature(enable = "avx")]
2732// This intrinsic has no corresponding instruction.
2733#[stable(feature = "simd_x86", since = "1.27.0")]
2734#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2735pub const fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2736    unsafe { transmute(i64x4::new(a, b, c, d)) }
2737}
2738
2739/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2740/// elements of returned vector.
2741///
2742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2743#[inline]
2744#[target_feature(enable = "avx")]
2745// This intrinsic has no corresponding instruction.
2746#[stable(feature = "simd_x86", since = "1.27.0")]
2747#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2748pub const fn _mm256_set1_pd(a: f64) -> __m256d {
2749    _mm256_setr_pd(a, a, a, a)
2750}
2751
2752/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2753/// elements of returned vector.
2754///
2755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2756#[inline]
2757#[target_feature(enable = "avx")]
2758// This intrinsic has no corresponding instruction.
2759#[stable(feature = "simd_x86", since = "1.27.0")]
2760#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2761pub const fn _mm256_set1_ps(a: f32) -> __m256 {
2762    _mm256_setr_ps(a, a, a, a, a, a, a, a)
2763}
2764
2765/// Broadcasts 8-bit integer `a` to all elements of returned vector.
2766/// This intrinsic may generate the `vpbroadcastb` instruction.
2767///
2768/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2769#[inline]
2770#[target_feature(enable = "avx")]
2771// This intrinsic has no corresponding instruction.
2772#[stable(feature = "simd_x86", since = "1.27.0")]
2773#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2774pub const fn _mm256_set1_epi8(a: i8) -> __m256i {
2775    #[rustfmt::skip]
2776    _mm256_setr_epi8(
2777        a, a, a, a, a, a, a, a,
2778        a, a, a, a, a, a, a, a,
2779        a, a, a, a, a, a, a, a,
2780        a, a, a, a, a, a, a, a,
2781    )
2782}
2783
2784/// Broadcasts 16-bit integer `a` to all elements of returned vector.
2785/// This intrinsic may generate the `vpbroadcastw` instruction.
2786///
2787/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2788#[inline]
2789#[target_feature(enable = "avx")]
2790//#[cfg_attr(test, assert_instr(vpshufb))]
2791#[cfg_attr(test, assert_instr(vinsertf128))]
2792// This intrinsic has no corresponding instruction.
2793#[stable(feature = "simd_x86", since = "1.27.0")]
2794#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2795pub const fn _mm256_set1_epi16(a: i16) -> __m256i {
2796    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
2797}
2798
2799/// Broadcasts 32-bit integer `a` to all elements of returned vector.
2800/// This intrinsic may generate the `vpbroadcastd` instruction.
2801///
2802/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
2803#[inline]
2804#[target_feature(enable = "avx")]
2805// This intrinsic has no corresponding instruction.
2806#[stable(feature = "simd_x86", since = "1.27.0")]
2807#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2808pub const fn _mm256_set1_epi32(a: i32) -> __m256i {
2809    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
2810}
2811
2812/// Broadcasts 64-bit integer `a` to all elements of returned vector.
2813/// This intrinsic may generate the `vpbroadcastq` instruction.
2814///
2815/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
2816#[inline]
2817#[target_feature(enable = "avx")]
2818#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2819#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
2820// This intrinsic has no corresponding instruction.
2821#[stable(feature = "simd_x86", since = "1.27.0")]
2822#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2823pub const fn _mm256_set1_epi64x(a: i64) -> __m256i {
2824    _mm256_setr_epi64x(a, a, a, a)
2825}
2826
2827/// Casts vector of type __m256d to type __m256.
2828///
2829/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2830#[inline]
2831#[target_feature(enable = "avx")]
2832// This intrinsic is only used for compilation and does not generate any
2833// instructions, thus it has zero latency.
2834#[stable(feature = "simd_x86", since = "1.27.0")]
2835#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2836pub const fn _mm256_castpd_ps(a: __m256d) -> __m256 {
2837    unsafe { transmute(a) }
2838}
2839
2840/// Casts vector of type __m256 to type __m256d.
2841///
2842/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2843#[inline]
2844#[target_feature(enable = "avx")]
2845// This intrinsic is only used for compilation and does not generate any
2846// instructions, thus it has zero latency.
2847#[stable(feature = "simd_x86", since = "1.27.0")]
2848#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2849pub const fn _mm256_castps_pd(a: __m256) -> __m256d {
2850    unsafe { transmute(a) }
2851}
2852
2853/// Casts vector of type __m256 to type __m256i.
2854///
2855/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2856#[inline]
2857#[target_feature(enable = "avx")]
2858// This intrinsic is only used for compilation and does not generate any
2859// instructions, thus it has zero latency.
2860#[stable(feature = "simd_x86", since = "1.27.0")]
2861#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2862pub const fn _mm256_castps_si256(a: __m256) -> __m256i {
2863    unsafe { transmute(a) }
2864}
2865
2866/// Casts vector of type __m256i to type __m256.
2867///
2868/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2869#[inline]
2870#[target_feature(enable = "avx")]
2871// This intrinsic is only used for compilation and does not generate any
2872// instructions, thus it has zero latency.
2873#[stable(feature = "simd_x86", since = "1.27.0")]
2874#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2875pub const fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
2876    unsafe { transmute(a) }
2877}
2878
2879/// Casts vector of type __m256d to type __m256i.
2880///
2881/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
2882#[inline]
2883#[target_feature(enable = "avx")]
2884// This intrinsic is only used for compilation and does not generate any
2885// instructions, thus it has zero latency.
2886#[stable(feature = "simd_x86", since = "1.27.0")]
2887#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2888pub const fn _mm256_castpd_si256(a: __m256d) -> __m256i {
2889    unsafe { transmute(a) }
2890}
2891
2892/// Casts vector of type __m256i to type __m256d.
2893///
2894/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
2895#[inline]
2896#[target_feature(enable = "avx")]
2897// This intrinsic is only used for compilation and does not generate any
2898// instructions, thus it has zero latency.
2899#[stable(feature = "simd_x86", since = "1.27.0")]
2900#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2901pub const fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
2902    unsafe { transmute(a) }
2903}
2904
2905/// Casts vector of type __m256 to type __m128.
2906///
2907/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
2908#[inline]
2909#[target_feature(enable = "avx")]
2910// This intrinsic is only used for compilation and does not generate any
2911// instructions, thus it has zero latency.
2912#[stable(feature = "simd_x86", since = "1.27.0")]
2913#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2914pub const fn _mm256_castps256_ps128(a: __m256) -> __m128 {
2915    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
2916}
2917
2918/// Casts vector of type __m256d to type __m128d.
2919///
2920/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
2921#[inline]
2922#[target_feature(enable = "avx")]
2923// This intrinsic is only used for compilation and does not generate any
2924// instructions, thus it has zero latency.
2925#[stable(feature = "simd_x86", since = "1.27.0")]
2926#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2927pub const fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
2928    unsafe { simd_shuffle!(a, a, [0, 1]) }
2929}
2930
2931/// Casts vector of type __m256i to type __m128i.
2932///
2933/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
2934#[inline]
2935#[target_feature(enable = "avx")]
2936// This intrinsic is only used for compilation and does not generate any
2937// instructions, thus it has zero latency.
2938#[stable(feature = "simd_x86", since = "1.27.0")]
2939#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2940pub const fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
2941    unsafe {
2942        let a = a.as_i64x4();
2943        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
2944        transmute(dst)
2945    }
2946}
2947
2948/// Casts vector of type __m128 to type __m256;
2949/// the upper 128 bits of the result are indeterminate.
2950///
2951/// In the Intel documentation, the upper bits are declared to be "undefined".
2952/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
2953/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
2954///
2955/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
2956#[inline]
2957#[target_feature(enable = "avx")]
2958// This intrinsic is only used for compilation and does not generate any
2959// instructions, thus it has zero latency.
2960#[stable(feature = "simd_x86", since = "1.27.0")]
2961#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2962pub const fn _mm256_castps128_ps256(a: __m128) -> __m256 {
2963    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
2964}
2965
2966/// Casts vector of type __m128d to type __m256d;
2967/// the upper 128 bits of the result are indeterminate.
2968///
2969/// In the Intel documentation, the upper bits are declared to be "undefined".
2970/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
2971/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
2972///
2973/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
2974#[inline]
2975#[target_feature(enable = "avx")]
2976// This intrinsic is only used for compilation and does not generate any
2977// instructions, thus it has zero latency.
2978#[stable(feature = "simd_x86", since = "1.27.0")]
2979#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2980pub const fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
2981    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
2982}
2983
2984/// Casts vector of type __m128i to type __m256i;
2985/// the upper 128 bits of the result are indeterminate.
2986///
2987/// In the Intel documentation, the upper bits are declared to be "undefined".
2988/// This is not equivalent to [`mem::MaybeUninit`]; instead, these bits are non-deterministically
2989/// set to some valid value. In practice, this is typically equivalent to [`mem::zeroed`].
2990///
2991/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
2992#[inline]
2993#[target_feature(enable = "avx")]
2994// This intrinsic is only used for compilation and does not generate any
2995// instructions, thus it has zero latency.
2996#[stable(feature = "simd_x86", since = "1.27.0")]
2997#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2998pub const fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
2999    unsafe {
3000        let a = a.as_i64x2();
3001        let undefined = i64x2::ZERO;
3002        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
3003        transmute(dst)
3004    }
3005}
3006
3007/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
3008/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
3009/// the value of the source vector. The upper 128 bits are set to zero.
3010///
3011/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
3012#[inline]
3013#[target_feature(enable = "avx")]
3014// This intrinsic is only used for compilation and does not generate any
3015// instructions, thus it has zero latency.
3016#[stable(feature = "simd_x86", since = "1.27.0")]
3017#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3018pub const fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
3019    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
3020}
3021
3022/// Constructs a 256-bit integer vector from a 128-bit integer vector.
3023/// The lower 128 bits contain the value of the source vector. The upper
3024/// 128 bits are set to zero.
3025///
3026/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
3027#[inline]
3028#[target_feature(enable = "avx")]
3029// This intrinsic is only used for compilation and does not generate any
3030// instructions, thus it has zero latency.
3031#[stable(feature = "simd_x86", since = "1.27.0")]
3032#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3033pub const fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
3034    unsafe {
3035        let b = i64x2::ZERO;
3036        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
3037        transmute(dst)
3038    }
3039}
3040
3041/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
3042/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
3043/// contain the value of the source vector. The upper 128 bits are set
3044/// to zero.
3045///
3046/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
3047#[inline]
3048#[target_feature(enable = "avx")]
3049// This intrinsic is only used for compilation and does not generate any
3050// instructions, thus it has zero latency.
3051#[stable(feature = "simd_x86", since = "1.27.0")]
3052#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3053pub const fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
3054    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
3055}
3056
3057/// Returns vector of type `__m256` with indeterminate elements.
3058/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3059/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3060/// In practice, this is typically equivalent to [`mem::zeroed`].
3061///
3062/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
3063#[inline]
3064#[target_feature(enable = "avx")]
3065// This intrinsic has no corresponding instruction.
3066#[stable(feature = "simd_x86", since = "1.27.0")]
3067#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3068pub const fn _mm256_undefined_ps() -> __m256 {
3069    const { unsafe { mem::zeroed() } }
3070}
3071
3072/// Returns vector of type `__m256d` with indeterminate elements.
3073/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3074/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3075/// In practice, this is typically equivalent to [`mem::zeroed`].
3076///
3077/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
3078#[inline]
3079#[target_feature(enable = "avx")]
3080// This intrinsic has no corresponding instruction.
3081#[stable(feature = "simd_x86", since = "1.27.0")]
3082#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3083pub const fn _mm256_undefined_pd() -> __m256d {
3084    const { unsafe { mem::zeroed() } }
3085}
3086
3087/// Returns vector of type `__m256i` with indeterminate elements.
3088/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3089/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3090/// In practice, this is typically equivalent to [`mem::zeroed`].
3091///
3092/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
3093#[inline]
3094#[target_feature(enable = "avx")]
3095// This intrinsic has no corresponding instruction.
3096#[stable(feature = "simd_x86", since = "1.27.0")]
3097#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3098pub const fn _mm256_undefined_si256() -> __m256i {
3099    const { unsafe { mem::zeroed() } }
3100}
3101
3102/// Sets packed __m256 returned vector with the supplied values.
3103///
3104/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
3105#[inline]
3106#[target_feature(enable = "avx")]
3107#[cfg_attr(test, assert_instr(vinsertf128))]
3108#[stable(feature = "simd_x86", since = "1.27.0")]
3109#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3110pub const fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
3111    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
3112}
3113
3114/// Sets packed __m256d returned vector with the supplied values.
3115///
3116/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
3117#[inline]
3118#[target_feature(enable = "avx")]
3119#[cfg_attr(test, assert_instr(vinsertf128))]
3120#[stable(feature = "simd_x86", since = "1.27.0")]
3121#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3122pub const fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
3123    unsafe {
3124        let hi: __m128 = transmute(hi);
3125        let lo: __m128 = transmute(lo);
3126        transmute(_mm256_set_m128(hi, lo))
3127    }
3128}
3129
3130/// Sets packed __m256i returned vector with the supplied values.
3131///
3132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
3133#[inline]
3134#[target_feature(enable = "avx")]
3135#[cfg_attr(test, assert_instr(vinsertf128))]
3136#[stable(feature = "simd_x86", since = "1.27.0")]
3137#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3138pub const fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
3139    unsafe {
3140        let hi: __m128 = transmute(hi);
3141        let lo: __m128 = transmute(lo);
3142        transmute(_mm256_set_m128(hi, lo))
3143    }
3144}
3145
3146/// Sets packed __m256 returned vector with the supplied values.
3147///
3148/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
3149#[inline]
3150#[target_feature(enable = "avx")]
3151#[cfg_attr(test, assert_instr(vinsertf128))]
3152#[stable(feature = "simd_x86", since = "1.27.0")]
3153#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3154pub const fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
3155    _mm256_set_m128(hi, lo)
3156}
3157
3158/// Sets packed __m256d returned vector with the supplied values.
3159///
3160/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
3161#[inline]
3162#[target_feature(enable = "avx")]
3163#[cfg_attr(test, assert_instr(vinsertf128))]
3164#[stable(feature = "simd_x86", since = "1.27.0")]
3165#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3166pub const fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
3167    _mm256_set_m128d(hi, lo)
3168}
3169
3170/// Sets packed __m256i returned vector with the supplied values.
3171///
3172/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
3173#[inline]
3174#[target_feature(enable = "avx")]
3175#[cfg_attr(test, assert_instr(vinsertf128))]
3176#[stable(feature = "simd_x86", since = "1.27.0")]
3177#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3178pub const fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
3179    _mm256_set_m128i(hi, lo)
3180}
3181
3182/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
3183/// floating-point elements) from memory, and combines them into a 256-bit
3184/// value.
3185/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3186///
3187/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
3188#[inline]
3189#[target_feature(enable = "avx")]
3190// This intrinsic has no corresponding instruction.
3191#[stable(feature = "simd_x86", since = "1.27.0")]
3192#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3193pub const unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
3194    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
3195    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
3196}
3197
3198/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
3199/// floating-point elements) from memory, and combines them into a 256-bit
3200/// value.
3201/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3202///
3203/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
3204#[inline]
3205#[target_feature(enable = "avx")]
3206// This intrinsic has no corresponding instruction.
3207#[stable(feature = "simd_x86", since = "1.27.0")]
3208#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3209pub const unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
3210    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
3211    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
3212}
3213
3214/// Loads two 128-bit values (composed of integer data) from memory, and combines
3215/// them into a 256-bit value.
3216/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3217///
3218/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
3219#[inline]
3220#[target_feature(enable = "avx")]
3221// This intrinsic has no corresponding instruction.
3222#[stable(feature = "simd_x86", since = "1.27.0")]
3223#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3224pub const unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
3225    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
3226    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
3227}
3228
3229/// Stores the high and low 128-bit halves (each composed of 4 packed
3230/// single-precision (32-bit) floating-point elements) from `a` into memory at two
3231/// different 128-bit locations.
3232/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3233///
3234/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
3235#[inline]
3236#[target_feature(enable = "avx")]
3237// This intrinsic has no corresponding instruction.
3238#[stable(feature = "simd_x86", since = "1.27.0")]
3239#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3240pub const unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
3241    let lo = _mm256_castps256_ps128(a);
3242    _mm_storeu_ps(loaddr, lo);
3243    let hi = _mm256_extractf128_ps::<1>(a);
3244    _mm_storeu_ps(hiaddr, hi);
3245}
3246
3247/// Stores the high and low 128-bit halves (each composed of 2 packed
3248/// double-precision (64-bit) floating-point elements) from `a` into memory at two
3249/// different 128-bit locations.
3250/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3251///
3252/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
3253#[inline]
3254#[target_feature(enable = "avx")]
3255// This intrinsic has no corresponding instruction.
3256#[stable(feature = "simd_x86", since = "1.27.0")]
3257#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3258pub const unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
3259    let lo = _mm256_castpd256_pd128(a);
3260    _mm_storeu_pd(loaddr, lo);
3261    let hi = _mm256_extractf128_pd::<1>(a);
3262    _mm_storeu_pd(hiaddr, hi);
3263}
3264
3265/// Stores the high and low 128-bit halves (each composed of integer data) from
3266/// `a` into memory at two different 128-bit locations.
3267/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3268///
3269/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
3270#[inline]
3271#[target_feature(enable = "avx")]
3272// This intrinsic has no corresponding instruction.
3273#[stable(feature = "simd_x86", since = "1.27.0")]
3274#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3275pub const unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
3276    let lo = _mm256_castsi256_si128(a);
3277    _mm_storeu_si128(loaddr, lo);
3278    let hi = _mm256_extractf128_si256::<1>(a);
3279    _mm_storeu_si128(hiaddr, hi);
3280}
3281
3282/// Returns the first element of the input vector of `[8 x float]`.
3283///
3284/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
3285#[inline]
3286#[target_feature(enable = "avx")]
3287//#[cfg_attr(test, assert_instr(movss))] FIXME
3288#[stable(feature = "simd_x86", since = "1.27.0")]
3289#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3290pub const fn _mm256_cvtss_f32(a: __m256) -> f32 {
3291    unsafe { simd_extract!(a, 0) }
3292}
3293
3294// LLVM intrinsics used in the above functions
3295#[allow(improper_ctypes)]
3296unsafe extern "C" {
3297    #[link_name = "llvm.x86.avx.round.pd.256"]
3298    fn roundpd256(a: __m256d, b: i32) -> __m256d;
3299    #[link_name = "llvm.x86.avx.round.ps.256"]
3300    fn roundps256(a: __m256, b: i32) -> __m256;
3301    #[link_name = "llvm.x86.avx.dp.ps.256"]
3302    fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
3303    #[link_name = "llvm.x86.sse2.cmp.pd"]
3304    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3305    #[link_name = "llvm.x86.avx.cmp.pd.256"]
3306    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
3307    #[link_name = "llvm.x86.sse.cmp.ps"]
3308    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
3309    #[link_name = "llvm.x86.avx.cmp.ps.256"]
3310    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
3311    #[link_name = "llvm.x86.sse2.cmp.sd"]
3312    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3313    #[link_name = "llvm.x86.sse.cmp.ss"]
3314    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
3315    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
3316    fn vcvtps2dq(a: __m256) -> i32x8;
3317    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
3318    fn vcvttpd2dq(a: __m256d) -> i32x4;
3319    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
3320    fn vcvtpd2dq(a: __m256d) -> i32x4;
3321    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
3322    fn vcvttps2dq(a: __m256) -> i32x8;
3323    #[link_name = "llvm.x86.avx.vzeroall"]
3324    fn vzeroall();
3325    #[link_name = "llvm.x86.avx.vzeroupper"]
3326    fn vzeroupper();
3327    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
3328    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
3329    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
3330    fn vpermilps(a: __m128, b: i32x4) -> __m128;
3331    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
3332    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
3333    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
3334    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
3335    #[link_name = "llvm.x86.avx.ldu.dq.256"]
3336    fn vlddqu(mem_addr: *const i8) -> i8x32;
3337    #[link_name = "llvm.x86.avx.rcp.ps.256"]
3338    fn vrcpps(a: __m256) -> __m256;
3339    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
3340    fn vrsqrtps(a: __m256) -> __m256;
3341    #[link_name = "llvm.x86.avx.ptestnzc.256"]
3342    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
3343    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
3344    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
3345    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
3346    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
3347    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
3348    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
3349    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
3350    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
3351    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
3352    fn vtestzps256(a: __m256, b: __m256) -> i32;
3353    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
3354    fn vtestcps256(a: __m256, b: __m256) -> i32;
3355    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
3356    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
3357    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
3358    fn vtestnzcps(a: __m128, b: __m128) -> i32;
3359    #[link_name = "llvm.x86.avx.min.ps.256"]
3360    fn vminps(a: __m256, b: __m256) -> __m256;
3361    #[link_name = "llvm.x86.avx.max.ps.256"]
3362    fn vmaxps(a: __m256, b: __m256) -> __m256;
3363    #[link_name = "llvm.x86.avx.min.pd.256"]
3364    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
3365    #[link_name = "llvm.x86.avx.max.pd.256"]
3366    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
3367}
3368
3369#[cfg(test)]
3370mod tests {
3371    use crate::core_arch::assert_eq_const as assert_eq;
3372    use crate::hint::black_box;
3373    use crate::ptr;
3374    use stdarch_test::simd_test;
3375
3376    use crate::core_arch::x86::*;
3377
3378    #[simd_test(enable = "avx")]
3379    const fn test_mm256_add_pd() {
3380        let a = _mm256_setr_pd(1., 2., 3., 4.);
3381        let b = _mm256_setr_pd(5., 6., 7., 8.);
3382        let r = _mm256_add_pd(a, b);
3383        let e = _mm256_setr_pd(6., 8., 10., 12.);
3384        assert_eq_m256d(r, e);
3385    }
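
    // Added illustrative sketch, not from the upstream test suite: it shows the
    // argument-order relationship between `_mm256_set_pd` and `_mm256_setr_pd`
    // described in their docs above. The function name is ours; it reuses the
    // `#[simd_test]` harness and assert helpers of the surrounding tests.
    #[simd_test(enable = "avx")]
    fn set_pd_argument_order_sketch() {
        // `_mm256_set_pd` takes its arguments from the highest element down to
        // the lowest, so reversing them reproduces the `setr` ordering.
        let r = _mm256_set_pd(4., 3., 2., 1.);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }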
3386
3387    #[simd_test(enable = "avx")]
3388    const fn test_mm256_add_ps() {
3389        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3390        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3391        let r = _mm256_add_ps(a, b);
3392        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3393        assert_eq_m256(r, e);
3394    }
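
    // Added sketch (ours, not upstream): the integer constructors follow the same
    // high-to-low argument convention, here checked for `_mm256_set_epi32`.
    // Assumes the `assert_eq_m256i` helper used elsewhere in this crate's tests.
    #[simd_test(enable = "avx")]
    fn set_epi32_argument_order_sketch() {
        let r = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
        let e = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(r, e);
    }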
3395
3396    #[simd_test(enable = "avx")]
3397    const fn test_mm256_and_pd() {
3398        let a = _mm256_set1_pd(1.);
3399        let b = _mm256_set1_pd(0.6);
3400        let r = _mm256_and_pd(a, b);
3401        let e = _mm256_set1_pd(0.5);
3402        assert_eq_m256d(r, e);
3403    }
3404
3405    #[simd_test(enable = "avx")]
3406    const fn test_mm256_and_ps() {
3407        let a = _mm256_set1_ps(1.);
3408        let b = _mm256_set1_ps(0.6);
3409        let r = _mm256_and_ps(a, b);
3410        let e = _mm256_set1_ps(0.5);
3411        assert_eq_m256(r, e);
3412    }
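
    // Added sketch (ours, not upstream): `_mm256_set1_ps` is a plain broadcast,
    // so it agrees with `_mm256_setr_ps` given the same value in every lane.
    #[simd_test(enable = "avx")]
    fn set1_ps_broadcast_sketch() {
        let r = _mm256_set1_ps(3.5);
        let e = _mm256_setr_ps(3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5);
        assert_eq_m256(r, e);
    }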
3413
3414    #[simd_test(enable = "avx")]
3415    const fn test_mm256_or_pd() {
3416        let a = _mm256_set1_pd(1.);
3417        let b = _mm256_set1_pd(0.6);
3418        let r = _mm256_or_pd(a, b);
3419        let e = _mm256_set1_pd(1.2);
3420        assert_eq_m256d(r, e);
3421    }
3422
3423    #[simd_test(enable = "avx")]
3424    const fn test_mm256_or_ps() {
3425        let a = _mm256_set1_ps(1.);
3426        let b = _mm256_set1_ps(0.6);
3427        let r = _mm256_or_ps(a, b);
3428        let e = _mm256_set1_ps(1.2);
3429        assert_eq_m256(r, e);
3430    }
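
    // Added sketch (ours, not upstream): `_mm256_setzero_ps` is the all-zero
    // vector, i.e. the same value as broadcasting 0.0.
    #[simd_test(enable = "avx")]
    fn setzero_ps_sketch() {
        assert_eq_m256(_mm256_setzero_ps(), _mm256_set1_ps(0.));
    }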
3431
3432    #[simd_test(enable = "avx")]
3433    const fn test_mm256_shuffle_pd() {
3434        let a = _mm256_setr_pd(1., 4., 5., 8.);
3435        let b = _mm256_setr_pd(2., 3., 6., 7.);
3436        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3437        let e = _mm256_setr_pd(4., 3., 8., 7.);
3438        assert_eq_m256d(r, e);
3439    }
3440
3441    #[simd_test(enable = "avx")]
3442    const fn test_mm256_shuffle_ps() {
3443        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3444        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3445        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3446        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3447        assert_eq_m256(r, e);
3448    }
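
    // Added sketch (ours, not upstream): the `_mm256_cast*` intrinsics only
    // reinterpret bits, so a cast round trip returns the original value.
    #[simd_test(enable = "avx")]
    fn castps_si256_roundtrip_sketch() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castsi256_ps(_mm256_castps_si256(a));
        assert_eq_m256(r, a);
    }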
3449
3450    #[simd_test(enable = "avx")]
3451    const fn test_mm256_andnot_pd() {
3452        let a = _mm256_set1_pd(0.);
3453        let b = _mm256_set1_pd(0.6);
3454        let r = _mm256_andnot_pd(a, b);
3455        assert_eq_m256d(r, b);
3456    }
3457
3458    #[simd_test(enable = "avx")]
3459    const fn test_mm256_andnot_ps() {
3460        let a = _mm256_set1_ps(0.);
3461        let b = _mm256_set1_ps(0.6);
3462        let r = _mm256_andnot_ps(a, b);
3463        assert_eq_m256(r, b);
3464    }
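
    // Added sketch (ours, not upstream): `_mm256_castps256_ps128` keeps the low
    // half of the vector. Assumes the `assert_eq_m128` helper available to this
    // crate's tests.
    #[simd_test(enable = "avx")]
    fn castps256_ps128_low_half_sketch() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps256_ps128(a);
        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
    }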
3465
3466    #[simd_test(enable = "avx")]
3467    unsafe fn test_mm256_max_pd() {
3468        let a = _mm256_setr_pd(1., 4., 5., 8.);
3469        let b = _mm256_setr_pd(2., 3., 6., 7.);
3470        let r = _mm256_max_pd(a, b);
3471        let e = _mm256_setr_pd(2., 4., 6., 8.);
3472        assert_eq_m256d(r, e);
3473        // > If the values being compared are both 0.0s (of either sign), the
3474        // > value in the second operand (source operand) is returned.
3475        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3476        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3477        let wu: [u64; 4] = transmute(w);
3478        let xu: [u64; 4] = transmute(x);
3479        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3480        assert_eq!(xu, [0u64; 4]);
3481        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3482        // > second operand (source operand), either a NaN or a valid
3483        // > floating-point value, is written to the result.
3484        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3485        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3486        let yf: [f64; 4] = transmute(y);
3487        let zf: [f64; 4] = transmute(z);
3488        assert_eq!(yf, [0.0; 4]);
3489        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3490    }
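
    // Added sketch (ours, not upstream): unlike the plain 128-to-256-bit cast,
    // `_mm256_zextps128_ps256` guarantees the upper 128 bits are zero, so the
    // whole result is determined.
    #[simd_test(enable = "avx")]
    fn zextps128_ps256_sketch() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_zextps128_ps256(a);
        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.));
    }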
3491
3492    #[simd_test(enable = "avx")]
3493    unsafe fn test_mm256_max_ps() {
3494        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3495        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3496        let r = _mm256_max_ps(a, b);
3497        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
3498        assert_eq_m256(r, e);
3499        // > If the values being compared are both 0.0s (of either sign), the
3500        // > value in the second operand (source operand) is returned.
3501        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3502        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3503        let wu: [u32; 8] = transmute(w);
3504        let xu: [u32; 8] = transmute(x);
3505        assert_eq!(wu, [0x8000_0000u32; 8]);
3506        assert_eq!(xu, [0u32; 8]);
3507        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3508        // > second operand (source operand), either a NaN or a valid
3509        // > floating-point value, is written to the result.
3510        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3511        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3512        let yf: [f32; 8] = transmute(y);
3513        let zf: [f32; 8] = transmute(z);
3514        assert_eq!(yf, [0.0; 8]);
3515        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3516    }
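
    // Added sketch (ours, not upstream): `_mm256_set_m128` places `lo` in the
    // low half and `hi` in the high half; `_mm256_setr_m128` takes the same
    // halves in the opposite argument order.
    #[simd_test(enable = "avx")]
    fn set_m128_halves_sketch() {
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(_mm256_set_m128(hi, lo), e);
        assert_eq_m256(_mm256_setr_m128(lo, hi), e);
    }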
3517
3518    #[simd_test(enable = "avx")]
3519    unsafe fn test_mm256_min_pd() {
3520        let a = _mm256_setr_pd(1., 4., 5., 8.);
3521        let b = _mm256_setr_pd(2., 3., 6., 7.);
3522        let r = _mm256_min_pd(a, b);
3523        let e = _mm256_setr_pd(1., 3., 5., 7.);
3524        assert_eq_m256d(r, e);
3525        // > If the values being compared are both 0.0s (of either sign), the
3526        // > value in the second operand (source operand) is returned.
3527        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3528        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3529        let wu: [u64; 4] = transmute(w);
3530        let xu: [u64; 4] = transmute(x);
3531        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3532        assert_eq!(xu, [0u64; 4]);
3533        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3534        // > second operand (source operand), either a NaN or a valid
3535        // > floating-point value, is written to the result.
3536        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3537        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3538        let yf: [f64; 4] = transmute(y);
3539        let zf: [f64; 4] = transmute(z);
3540        assert_eq!(yf, [0.0; 4]);
3541        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3542    }
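
    // Added sketch (ours, not upstream): `_mm256_storeu2_m128` writes the two
    // unaligned 128-bit halves and `_mm256_loadu2_m128` reassembles them, so a
    // store followed by a load round-trips the value.
    #[simd_test(enable = "avx")]
    unsafe fn loadu2_storeu2_m128_roundtrip_sketch() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let mut lo = [0.0f32; 4];
        let mut hi = [0.0f32; 4];
        _mm256_storeu2_m128(hi.as_mut_ptr(), lo.as_mut_ptr(), a);
        let r = _mm256_loadu2_m128(hi.as_ptr(), lo.as_ptr());
        assert_eq_m256(r, a);
    }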
3543
3544    #[simd_test(enable = "avx")]
3545    unsafe fn test_mm256_min_ps() {
3546        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3547        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3548        let r = _mm256_min_ps(a, b);
3549        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
3550        assert_eq_m256(r, e);
3551        // > If the values being compared are both 0.0s (of either sign), the
3552        // > value in the second operand (source operand) is returned.
3553        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3554        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3555        let wu: [u32; 8] = transmute(w);
3556        let xu: [u32; 8] = transmute(x);
3557        assert_eq!(wu, [0x8000_0000u32; 8]);
3558        assert_eq!(xu, [0u32; 8]);
3559        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3560        // > second operand (source operand), either a NaN or a valid
3561        // > floating-point value, is written to the result.
3562        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3563        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3564        let yf: [f32; 8] = transmute(y);
3565        let zf: [f32; 8] = transmute(z);
3566        assert_eq!(yf, [0.0; 8]);
3567        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3568    }
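
    // Added sketch (ours, not upstream): `_mm256_set1_epi8` broadcasts a byte to
    // all 32 lanes, which is bitwise the same as broadcasting that byte repeated
    // four times with `_mm256_set1_epi32`. Assumes `assert_eq_m256i`.
    #[simd_test(enable = "avx")]
    fn set1_epi8_broadcast_sketch() {
        let r = _mm256_set1_epi8(0x42);
        let e = _mm256_set1_epi32(0x42424242);
        assert_eq_m256i(r, e);
    }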
3569
3570    #[simd_test(enable = "avx")]
3571    const fn test_mm256_mul_pd() {
3572        let a = _mm256_setr_pd(1., 2., 3., 4.);
3573        let b = _mm256_setr_pd(5., 6., 7., 8.);
3574        let r = _mm256_mul_pd(a, b);
3575        let e = _mm256_setr_pd(5., 12., 21., 32.);
3576        assert_eq_m256d(r, e);
3577    }
3578
3579    #[simd_test(enable = "avx")]
3580    const fn test_mm256_mul_ps() {
3581        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3582        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3583        let r = _mm256_mul_ps(a, b);
3584        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3585        assert_eq_m256(r, e);
3586    }
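
    // Added sketch (ours, not upstream): truncating to the low 128 bits with
    // `_mm256_castsi256_si128` and zero-extending back with
    // `_mm256_zextsi128_si256` keeps the low two 64-bit lanes and zeroes the rest.
    #[simd_test(enable = "avx")]
    fn castsi256_zextsi128_sketch() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_zextsi128_si256(_mm256_castsi256_si128(a));
        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 0, 0));
    }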
3587
3588    #[simd_test(enable = "avx")]
3589    const fn test_mm256_addsub_pd() {
3590        let a = _mm256_setr_pd(1., 2., 3., 4.);
3591        let b = _mm256_setr_pd(5., 6., 7., 8.);
3592        let r = _mm256_addsub_pd(a, b);
3593        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3594        assert_eq_m256d(r, e);
3595    }
3596
3597    #[simd_test(enable = "avx")]
3598    const fn test_mm256_addsub_ps() {
3599        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3600        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3601        let r = _mm256_addsub_ps(a, b);
3602        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3603        assert_eq_m256(r, e);
3604    }
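
    // Added sketch (ours, not upstream): the double-precision variant
    // `_mm256_set_m128d` follows the same hi/lo layout as `_mm256_set_m128`.
    #[simd_test(enable = "avx")]
    fn set_m128d_halves_sketch() {
        let lo = _mm_setr_pd(1., 2.);
        let hi = _mm_setr_pd(3., 4.);
        let r = _mm256_set_m128d(hi, lo);
        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
    }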
3605
3606    #[simd_test(enable = "avx")]
3607    const fn test_mm256_sub_pd() {
3608        let a = _mm256_setr_pd(1., 2., 3., 4.);
3609        let b = _mm256_setr_pd(5., 6., 7., 8.);
3610        let r = _mm256_sub_pd(a, b);
3611        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3612        assert_eq_m256d(r, e);
3613    }
3614
3615    #[simd_test(enable = "avx")]
3616    const fn test_mm256_sub_ps() {
3617        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3618        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3619        let r = _mm256_sub_ps(a, b);
3620        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3621        assert_eq_m256(r, e);
3622    }
3623
3624    #[simd_test(enable = "avx")]
3625    fn test_mm256_round_pd() {
3626        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3627        let result_closest = _mm256_round_pd::<0b0000>(a);
3628        let result_down = _mm256_round_pd::<0b0001>(a);
3629        let result_up = _mm256_round_pd::<0b0010>(a);
3630        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3631        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3632        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3633        assert_eq_m256d(result_closest, expected_closest);
3634        assert_eq_m256d(result_down, expected_down);
3635        assert_eq_m256d(result_up, expected_up);
3636    }
3637
3638    #[simd_test(enable = "avx")]
3639    const fn test_mm256_floor_pd() {
3640        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3641        let result_down = _mm256_floor_pd(a);
3642        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3643        assert_eq_m256d(result_down, expected_down);
3644    }
3645
3646    #[simd_test(enable = "avx")]
3647    const fn test_mm256_ceil_pd() {
3648        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3649        let result_up = _mm256_ceil_pd(a);
3650        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3651        assert_eq_m256d(result_up, expected_up);
3652    }
3653
3654    #[simd_test(enable = "avx")]
3655    fn test_mm256_round_ps() {
3656        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3657        let result_closest = _mm256_round_ps::<0b0000>(a);
3658        let result_down = _mm256_round_ps::<0b0001>(a);
3659        let result_up = _mm256_round_ps::<0b0010>(a);
3660        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3661        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3662        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3663        assert_eq_m256(result_closest, expected_closest);
3664        assert_eq_m256(result_down, expected_down);
3665        assert_eq_m256(result_up, expected_up);
3666    }
3667
3668    #[simd_test(enable = "avx")]
3669    const fn test_mm256_floor_ps() {
3670        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3671        let result_down = _mm256_floor_ps(a);
3672        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3673        assert_eq_m256(result_down, expected_down);
3674    }
3675
3676    #[simd_test(enable = "avx")]
3677    const fn test_mm256_ceil_ps() {
3678        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3679        let result_up = _mm256_ceil_ps(a);
3680        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3681        assert_eq_m256(result_up, expected_up);
3682    }
3683
3684    #[simd_test(enable = "avx")]
3685    fn test_mm256_sqrt_pd() {
3686        let a = _mm256_setr_pd(4., 9., 16., 25.);
3687        let r = _mm256_sqrt_pd(a);
3688        let e = _mm256_setr_pd(2., 3., 4., 5.);
3689        assert_eq_m256d(r, e);
3690    }
3691
3692    #[simd_test(enable = "avx")]
3693    fn test_mm256_sqrt_ps() {
3694        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3695        let r = _mm256_sqrt_ps(a);
3696        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3697        assert_eq_m256(r, e);
3698    }
3699
3700    #[simd_test(enable = "avx")]
3701    const fn test_mm256_div_ps() {
3702        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3703        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3704        let r = _mm256_div_ps(a, b);
3705        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3706        assert_eq_m256(r, e);
3707    }
3708
3709    #[simd_test(enable = "avx")]
3710    const fn test_mm256_div_pd() {
3711        let a = _mm256_setr_pd(4., 9., 16., 25.);
3712        let b = _mm256_setr_pd(4., 3., 2., 5.);
3713        let r = _mm256_div_pd(a, b);
3714        let e = _mm256_setr_pd(1., 3., 8., 5.);
3715        assert_eq_m256d(r, e);
3716    }
3717
3718    #[simd_test(enable = "avx")]
3719    const fn test_mm256_blend_pd() {
3720        let a = _mm256_setr_pd(4., 9., 16., 25.);
3721        let b = _mm256_setr_pd(4., 3., 2., 5.);
3722        let r = _mm256_blend_pd::<0x0>(a, b);
3723        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3724        let r = _mm256_blend_pd::<0x3>(a, b);
3725        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3726        let r = _mm256_blend_pd::<0xF>(a, b);
3727        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3728    }
3729
3730    #[simd_test(enable = "avx")]
3731    const fn test_mm256_blend_ps() {
3732        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3733        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3734        let r = _mm256_blend_ps::<0x0>(a, b);
3735        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3736        let r = _mm256_blend_ps::<0x3>(a, b);
3737        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3738        let r = _mm256_blend_ps::<0xF>(a, b);
3739        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3740    }
3741
3742    #[simd_test(enable = "avx")]
3743    const fn test_mm256_blendv_pd() {
3744        let a = _mm256_setr_pd(4., 9., 16., 25.);
3745        let b = _mm256_setr_pd(4., 3., 2., 5.);
3746        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3747        let r = _mm256_blendv_pd(a, b, c);
3748        let e = _mm256_setr_pd(4., 9., 2., 5.);
3749        assert_eq_m256d(r, e);
3750    }
3751
3752    #[simd_test(enable = "avx")]
3753    const fn test_mm256_blendv_ps() {
3754        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3755        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3756        #[rustfmt::skip]
3757        let c = _mm256_setr_ps(
3758            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3759        );
3760        let r = _mm256_blendv_ps(a, b, c);
3761        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3762        assert_eq_m256(r, e);
3763    }
3764
3765    #[simd_test(enable = "avx")]
3766    fn test_mm256_dp_ps() {
3767        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3768        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3769        let r = _mm256_dp_ps::<0xFF>(a, b);
3770        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3771        assert_eq_m256(r, e);
3772    }
3773
3774    #[simd_test(enable = "avx")]
3775    const fn test_mm256_hadd_pd() {
3776        let a = _mm256_setr_pd(4., 9., 16., 25.);
3777        let b = _mm256_setr_pd(4., 3., 2., 5.);
3778        let r = _mm256_hadd_pd(a, b);
3779        let e = _mm256_setr_pd(13., 7., 41., 7.);
3780        assert_eq_m256d(r, e);
3781
3782        let a = _mm256_setr_pd(1., 2., 3., 4.);
3783        let b = _mm256_setr_pd(5., 6., 7., 8.);
3784        let r = _mm256_hadd_pd(a, b);
3785        let e = _mm256_setr_pd(3., 11., 7., 15.);
3786        assert_eq_m256d(r, e);
3787    }
3788
3789    #[simd_test(enable = "avx")]
3790    const fn test_mm256_hadd_ps() {
3791        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3792        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3793        let r = _mm256_hadd_ps(a, b);
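        // Within each 128-bit half the result is (a0+a1, a2+a3, b0+b1, b2+b3).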
3794        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3795        assert_eq_m256(r, e);
3796
3797        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3798        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3799        let r = _mm256_hadd_ps(a, b);
3800        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3801        assert_eq_m256(r, e);
3802    }
3803
3804    #[simd_test(enable = "avx")]
3805    const fn test_mm256_hsub_pd() {
3806        let a = _mm256_setr_pd(4., 9., 16., 25.);
3807        let b = _mm256_setr_pd(4., 3., 2., 5.);
3808        let r = _mm256_hsub_pd(a, b);
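        // Horizontal subtract mirrors hadd: (a0-a1, b0-b1, a2-a3, b2-b3).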
3809        let e = _mm256_setr_pd(-5., 1., -9., -3.);
3810        assert_eq_m256d(r, e);
3811
3812        let a = _mm256_setr_pd(1., 2., 3., 4.);
3813        let b = _mm256_setr_pd(5., 6., 7., 8.);
3814        let r = _mm256_hsub_pd(a, b);
3815        let e = _mm256_setr_pd(-1., -1., -1., -1.);
3816        assert_eq_m256d(r, e);
3817    }
3818
3819    #[simd_test(enable = "avx")]
3820    const fn test_mm256_hsub_ps() {
3821        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3822        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3823        let r = _mm256_hsub_ps(a, b);
3824        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3825        assert_eq_m256(r, e);
3826
3827        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3828        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3829        let r = _mm256_hsub_ps(a, b);
3830        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3831        assert_eq_m256(r, e);
3832    }
3833
3834    #[simd_test(enable = "avx")]
3835    const fn test_mm256_xor_pd() {
3836        let a = _mm256_setr_pd(4., 9., 16., 25.);
3837        let b = _mm256_set1_pd(0.);
3838        let r = _mm256_xor_pd(a, b);
3839        assert_eq_m256d(r, a);
3840    }
3841
3842    #[simd_test(enable = "avx")]
3843    const fn test_mm256_xor_ps() {
3844        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3845        let b = _mm256_set1_ps(0.);
3846        let r = _mm256_xor_ps(a, b);
3847        assert_eq_m256(r, a);
3848    }
3849
3850    #[simd_test(enable = "avx")]
3851    fn test_mm_cmp_pd() {
3852        let a = _mm_setr_pd(4., 9.);
3853        let b = _mm_setr_pd(4., 3.);
3854        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
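        // A true comparison produces an all-ones bit pattern, which reads back as NaN.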
3855        assert!(get_m128d(r, 0).is_nan());
3856        assert!(get_m128d(r, 1).is_nan());
3857    }
3858
3859    #[simd_test(enable = "avx")]
3860    fn test_mm256_cmp_pd() {
3861        let a = _mm256_setr_pd(1., 2., 3., 4.);
3862        let b = _mm256_setr_pd(5., 6., 7., 8.);
3863        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3864        let e = _mm256_set1_pd(0.);
3865        assert_eq_m256d(r, e);
3866    }
3867
3868    #[simd_test(enable = "avx")]
3869    fn test_mm_cmp_ps() {
3870        let a = _mm_setr_ps(4., 3., 2., 5.);
3871        let b = _mm_setr_ps(4., 9., 16., 25.);
3872        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
3873        assert!(get_m128(r, 0).is_nan());
3874        assert_eq!(get_m128(r, 1), 0.);
3875        assert_eq!(get_m128(r, 2), 0.);
3876        assert_eq!(get_m128(r, 3), 0.);
3877    }
3878
3879    #[simd_test(enable = "avx")]
3880    fn test_mm256_cmp_ps() {
3881        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3882        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3883        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3884        let e = _mm256_set1_ps(0.);
3885        assert_eq_m256(r, e);
3886    }
3887
3888    #[simd_test(enable = "avx")]
3889    fn test_mm_cmp_sd() {
3890        let a = _mm_setr_pd(4., 9.);
3891        let b = _mm_setr_pd(4., 3.);
3892        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
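        // Only the lowest element is compared; the upper element is copied from `a`.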
3893        assert!(get_m128d(r, 0).is_nan());
3894        assert_eq!(get_m128d(r, 1), 9.);
3895    }
3896
3897    #[simd_test(enable = "avx")]
3898    fn test_mm_cmp_ss() {
3899        let a = _mm_setr_ps(4., 3., 2., 5.);
3900        let b = _mm_setr_ps(4., 9., 16., 25.);
3901        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
3902        assert!(get_m128(r, 0).is_nan());
3903        assert_eq!(get_m128(r, 1), 3.);
3904        assert_eq!(get_m128(r, 2), 2.);
3905        assert_eq!(get_m128(r, 3), 5.);
3906    }
3907
3908    #[simd_test(enable = "avx")]
3909    const fn test_mm256_cvtepi32_pd() {
3910        let a = _mm_setr_epi32(4, 9, 16, 25);
3911        let r = _mm256_cvtepi32_pd(a);
3912        let e = _mm256_setr_pd(4., 9., 16., 25.);
3913        assert_eq_m256d(r, e);
3914    }
3915
3916    #[simd_test(enable = "avx")]
3917    const fn test_mm256_cvtepi32_ps() {
3918        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3919        let r = _mm256_cvtepi32_ps(a);
3920        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3921        assert_eq_m256(r, e);
3922    }
3923
3924    #[simd_test(enable = "avx")]
3925    const fn test_mm256_cvtpd_ps() {
3926        let a = _mm256_setr_pd(4., 9., 16., 25.);
3927        let r = _mm256_cvtpd_ps(a);
3928        let e = _mm_setr_ps(4., 9., 16., 25.);
3929        assert_eq_m128(r, e);
3930    }
3931
3932    #[simd_test(enable = "avx")]
3933    fn test_mm256_cvtps_epi32() {
3934        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3935        let r = _mm256_cvtps_epi32(a);
3936        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3937        assert_eq_m256i(r, e);
3938    }
3939
3940    #[simd_test(enable = "avx")]
3941    const fn test_mm256_cvtps_pd() {
3942        let a = _mm_setr_ps(4., 9., 16., 25.);
3943        let r = _mm256_cvtps_pd(a);
3944        let e = _mm256_setr_pd(4., 9., 16., 25.);
3945        assert_eq_m256d(r, e);
3946    }
3947
3948    #[simd_test(enable = "avx")]
3949    const fn test_mm256_cvtsd_f64() {
3950        let a = _mm256_setr_pd(1., 2., 3., 4.);
3951        let r = _mm256_cvtsd_f64(a);
3952        assert_eq!(r, 1.);
3953    }
3954
3955    #[simd_test(enable = "avx")]
3956    fn test_mm256_cvttpd_epi32() {
3957        let a = _mm256_setr_pd(4., 9., 16., 25.);
3958        let r = _mm256_cvttpd_epi32(a);
3959        let e = _mm_setr_epi32(4, 9, 16, 25);
3960        assert_eq_m128i(r, e);
3961    }
3962
3963    #[simd_test(enable = "avx")]
3964    fn test_mm256_cvtpd_epi32() {
3965        let a = _mm256_setr_pd(4., 9., 16., 25.);
3966        let r = _mm256_cvtpd_epi32(a);
3967        let e = _mm_setr_epi32(4, 9, 16, 25);
3968        assert_eq_m128i(r, e);
3969    }
3970
3971    #[simd_test(enable = "avx")]
3972    fn test_mm256_cvttps_epi32() {
3973        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3974        let r = _mm256_cvttps_epi32(a);
3975        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3976        assert_eq_m256i(r, e);
3977    }
3978
3979    #[simd_test(enable = "avx")]
3980    const fn test_mm256_extractf128_ps() {
3981        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3982        let r = _mm256_extractf128_ps::<0>(a);
3983        let e = _mm_setr_ps(4., 3., 2., 5.);
3984        assert_eq_m128(r, e);
3985    }
3986
3987    #[simd_test(enable = "avx")]
3988    const fn test_mm256_extractf128_pd() {
3989        let a = _mm256_setr_pd(4., 3., 2., 5.);
3990        let r = _mm256_extractf128_pd::<0>(a);
3991        let e = _mm_setr_pd(4., 3.);
3992        assert_eq_m128d(r, e);
3993    }
3994
3995    #[simd_test(enable = "avx")]
3996    const fn test_mm256_extractf128_si256() {
3997        let a = _mm256_setr_epi64x(4, 3, 2, 5);
3998        let r = _mm256_extractf128_si256::<0>(a);
3999        let e = _mm_setr_epi64x(4, 3);
4000        assert_eq_m128i(r, e);
4001    }
4002
4003    #[simd_test(enable = "avx")]
4004    const fn test_mm256_extract_epi32() {
4005        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
4006        let r1 = _mm256_extract_epi32::<0>(a);
4007        let r2 = _mm256_extract_epi32::<3>(a);
4008        assert_eq!(r1, -1);
4009        assert_eq!(r2, 3);
4010    }
4011
4012    #[simd_test(enable = "avx")]
4013    const fn test_mm256_cvtsi256_si32() {
4014        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4015        let r = _mm256_cvtsi256_si32(a);
4016        assert_eq!(r, 1);
4017    }
4018
4019    #[simd_test(enable = "avx")]
4020    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
4021    fn test_mm256_zeroall() {
4022        _mm256_zeroall();
4023    }
4024
4025    #[simd_test(enable = "avx")]
4026    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
4027    fn test_mm256_zeroupper() {
4028        _mm256_zeroupper();
4029    }
4030
4031    #[simd_test(enable = "avx")]
4032    fn test_mm256_permutevar_ps() {
4033        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4034        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
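        // Only the low two bits of each control element are used, and selection
        // never crosses a 128-bit half.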
4035        let r = _mm256_permutevar_ps(a, b);
4036        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
4037        assert_eq_m256(r, e);
4038    }
4039
4040    #[simd_test(enable = "avx")]
4041    fn test_mm_permutevar_ps() {
4042        let a = _mm_setr_ps(4., 3., 2., 5.);
4043        let b = _mm_setr_epi32(1, 2, 3, 4);
4044        let r = _mm_permutevar_ps(a, b);
4045        let e = _mm_setr_ps(3., 2., 5., 4.);
4046        assert_eq_m128(r, e);
4047    }
4048
4049    #[simd_test(enable = "avx")]
4050    const fn test_mm256_permute_ps() {
4051        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4052        let r = _mm256_permute_ps::<0x1b>(a);
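        // 0x1b = 0b00_01_10_11 reverses the four elements of each 128-bit half.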
4053        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
4054        assert_eq_m256(r, e);
4055    }
4056
4057    #[simd_test(enable = "avx")]
4058    const fn test_mm_permute_ps() {
4059        let a = _mm_setr_ps(4., 3., 2., 5.);
4060        let r = _mm_permute_ps::<0x1b>(a);
4061        let e = _mm_setr_ps(5., 2., 3., 4.);
4062        assert_eq_m128(r, e);
4063    }
4064
4065    #[simd_test(enable = "avx")]
4066    fn test_mm256_permutevar_pd() {
4067        let a = _mm256_setr_pd(4., 3., 2., 5.);
4068        let b = _mm256_setr_epi64x(1, 2, 3, 4);
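        // Bit 1 of each 64-bit control element selects the low (0) or high (1)
        // element within the same 128-bit half.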
4069        let r = _mm256_permutevar_pd(a, b);
4070        let e = _mm256_setr_pd(4., 3., 5., 2.);
4071        assert_eq_m256d(r, e);
4072    }
4073
4074    #[simd_test(enable = "avx")]
4075    fn test_mm_permutevar_pd() {
4076        let a = _mm_setr_pd(4., 3.);
4077        let b = _mm_setr_epi64x(3, 0);
4078        let r = _mm_permutevar_pd(a, b);
4079        let e = _mm_setr_pd(3., 4.);
4080        assert_eq_m128d(r, e);
4081    }
4082
4083    #[simd_test(enable = "avx")]
4084    const fn test_mm256_permute_pd() {
4085        let a = _mm256_setr_pd(4., 3., 2., 5.);
4086        let r = _mm256_permute_pd::<5>(a);
4087        let e = _mm256_setr_pd(3., 4., 5., 2.);
4088        assert_eq_m256d(r, e);
4089    }
4090
4091    #[simd_test(enable = "avx")]
4092    const fn test_mm_permute_pd() {
4093        let a = _mm_setr_pd(4., 3.);
4094        let r = _mm_permute_pd::<1>(a);
4095        let e = _mm_setr_pd(3., 4.);
4096        assert_eq_m128d(r, e);
4097    }
4098
4099    #[simd_test(enable = "avx")]
4100    const fn test_mm256_permute2f128_ps() {
4101        let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
4102        let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
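        // The low and high 4-bit fields of the immediate each select a 128-bit source half:
        // 0 = low(a), 1 = high(a), 2 = low(b), 3 = high(b).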
4103        let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
4104        let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
4105        assert_eq_m256(r, e);
4106
4107        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4108        let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
4109        let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
4110        assert_eq_m256(r, z);
4111    }
4112
4113    #[simd_test(enable = "avx")]
4114    const fn test_mm256_permute2f128_pd() {
4115        let a = _mm256_setr_pd(1., 2., 3., 4.);
4116        let b = _mm256_setr_pd(5., 6., 7., 8.);
4117        let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
4118        let e = _mm256_setr_pd(3., 4., 7., 8.);
4119        assert_eq_m256d(r, e);
4120
4121        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4122        let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
4123        let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
4124        assert_eq_m256d(r, e);
4125    }
4126
4127    #[simd_test(enable = "avx")]
4128    const fn test_mm256_permute2f128_si256() {
4129        let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
4130        let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
4131        let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
4132        let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
4133        assert_eq_m256i(r, e);
4134
4135        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
4136        let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
4137        let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
4138        assert_eq_m256i(r, e);
4139    }
4140
4141    #[simd_test(enable = "avx")]
4142    const fn test_mm256_broadcast_ss() {
4143        let r = _mm256_broadcast_ss(&3.);
4144        let e = _mm256_set1_ps(3.);
4145        assert_eq_m256(r, e);
4146    }
4147
4148    #[simd_test(enable = "avx")]
4149    const fn test_mm_broadcast_ss() {
4150        let r = _mm_broadcast_ss(&3.);
4151        let e = _mm_set1_ps(3.);
4152        assert_eq_m128(r, e);
4153    }
4154
4155    #[simd_test(enable = "avx")]
4156    const fn test_mm256_broadcast_sd() {
4157        let r = _mm256_broadcast_sd(&3.);
4158        let e = _mm256_set1_pd(3.);
4159        assert_eq_m256d(r, e);
4160    }
4161
4162    #[simd_test(enable = "avx")]
4163    const fn test_mm256_broadcast_ps() {
4164        let a = _mm_setr_ps(4., 3., 2., 5.);
4165        let r = _mm256_broadcast_ps(&a);
4166        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
4167        assert_eq_m256(r, e);
4168    }
4169
4170    #[simd_test(enable = "avx")]
4171    const fn test_mm256_broadcast_pd() {
4172        let a = _mm_setr_pd(4., 3.);
4173        let r = _mm256_broadcast_pd(&a);
4174        let e = _mm256_setr_pd(4., 3., 4., 3.);
4175        assert_eq_m256d(r, e);
4176    }
4177
4178    #[simd_test(enable = "avx")]
4179    const fn test_mm256_insertf128_ps() {
4180        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4181        let b = _mm_setr_ps(4., 9., 16., 25.);
4182        let r = _mm256_insertf128_ps::<0>(a, b);
4183        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
4184        assert_eq_m256(r, e);
4185    }
4186
4187    #[simd_test(enable = "avx")]
4188    const fn test_mm256_insertf128_pd() {
4189        let a = _mm256_setr_pd(1., 2., 3., 4.);
4190        let b = _mm_setr_pd(5., 6.);
4191        let r = _mm256_insertf128_pd::<0>(a, b);
4192        let e = _mm256_setr_pd(5., 6., 3., 4.);
4193        assert_eq_m256d(r, e);
4194    }
4195
4196    #[simd_test(enable = "avx")]
4197    const fn test_mm256_insertf128_si256() {
4198        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4199        let b = _mm_setr_epi64x(5, 6);
4200        let r = _mm256_insertf128_si256::<0>(a, b);
4201        let e = _mm256_setr_epi64x(5, 6, 3, 4);
4202        assert_eq_m256i(r, e);
4203    }
4204
4205    #[simd_test(enable = "avx")]
4206    const fn test_mm256_insert_epi8() {
4207        #[rustfmt::skip]
4208        let a = _mm256_setr_epi8(
4209            1, 2, 3, 4, 5, 6, 7, 8,
4210            9, 10, 11, 12, 13, 14, 15, 16,
4211            17, 18, 19, 20, 21, 22, 23, 24,
4212            25, 26, 27, 28, 29, 30, 31, 32,
4213        );
4214        let r = _mm256_insert_epi8::<31>(a, 0);
4215        #[rustfmt::skip]
4216        let e = _mm256_setr_epi8(
4217            1, 2, 3, 4, 5, 6, 7, 8,
4218            9, 10, 11, 12, 13, 14, 15, 16,
4219            17, 18, 19, 20, 21, 22, 23, 24,
4220            25, 26, 27, 28, 29, 30, 31, 0,
4221        );
4222        assert_eq_m256i(r, e);
4223    }
4224
4225    #[simd_test(enable = "avx")]
4226    const fn test_mm256_insert_epi16() {
4227        #[rustfmt::skip]
4228        let a = _mm256_setr_epi16(
4229            0, 1, 2, 3, 4, 5, 6, 7,
4230            8, 9, 10, 11, 12, 13, 14, 15,
4231        );
4232        let r = _mm256_insert_epi16::<15>(a, 0);
4233        #[rustfmt::skip]
4234        let e = _mm256_setr_epi16(
4235            0, 1, 2, 3, 4, 5, 6, 7,
4236            8, 9, 10, 11, 12, 13, 14, 0,
4237        );
4238        assert_eq_m256i(r, e);
4239    }
4240
4241    #[simd_test(enable = "avx")]
4242    const fn test_mm256_insert_epi32() {
4243        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4244        let r = _mm256_insert_epi32::<7>(a, 0);
4245        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
4246        assert_eq_m256i(r, e);
4247    }
4248
4249    #[simd_test(enable = "avx")]
4250    const unsafe fn test_mm256_load_pd() {
4251        let a = _mm256_setr_pd(1., 2., 3., 4.);
4252        let p = ptr::addr_of!(a) as *const f64;
4253        let r = _mm256_load_pd(p);
4254        let e = _mm256_setr_pd(1., 2., 3., 4.);
4255        assert_eq_m256d(r, e);
4256    }
4257
4258    #[simd_test(enable = "avx")]
4259    const unsafe fn test_mm256_store_pd() {
4260        let a = _mm256_setr_pd(1., 2., 3., 4.);
4261        let mut r = _mm256_undefined_pd();
4262        _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4263        assert_eq_m256d(r, a);
4264    }
4265
4266    #[simd_test(enable = "avx")]
4267    const unsafe fn test_mm256_load_ps() {
4268        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4269        let p = ptr::addr_of!(a) as *const f32;
4270        let r = _mm256_load_ps(p);
4271        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4272        assert_eq_m256(r, e);
4273    }
4274
4275    #[simd_test(enable = "avx")]
4276    const unsafe fn test_mm256_store_ps() {
4277        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4278        let mut r = _mm256_undefined_ps();
4279        _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4280        assert_eq_m256(r, a);
4281    }
4282
4283    #[simd_test(enable = "avx")]
4284    const unsafe fn test_mm256_loadu_pd() {
4285        let a = &[1.0f64, 2., 3., 4.];
4286        let p = a.as_ptr();
4287        let r = _mm256_loadu_pd(black_box(p));
4288        let e = _mm256_setr_pd(1., 2., 3., 4.);
4289        assert_eq_m256d(r, e);
4290    }
4291
4292    #[simd_test(enable = "avx")]
4293    const unsafe fn test_mm256_storeu_pd() {
4294        let a = _mm256_set1_pd(9.);
4295        let mut r = _mm256_undefined_pd();
4296        _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4297        assert_eq_m256d(r, a);
4298    }
4299
4300    #[simd_test(enable = "avx")]
4301    const unsafe fn test_mm256_loadu_ps() {
4302        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
4303        let p = a.as_ptr();
4304        let r = _mm256_loadu_ps(black_box(p));
4305        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4306        assert_eq_m256(r, e);
4307    }
4308
4309    #[simd_test(enable = "avx")]
4310    const unsafe fn test_mm256_storeu_ps() {
4311        let a = _mm256_set1_ps(9.);
4312        let mut r = _mm256_undefined_ps();
4313        _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4314        assert_eq_m256(r, a);
4315    }
4316
4317    #[simd_test(enable = "avx")]
4318    const unsafe fn test_mm256_load_si256() {
4319        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4320        let p = ptr::addr_of!(a);
4321        let r = _mm256_load_si256(p);
4322        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4323        assert_eq_m256i(r, e);
4324    }
4325
4326    #[simd_test(enable = "avx")]
4327    const unsafe fn test_mm256_store_si256() {
4328        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4329        let mut r = _mm256_undefined_si256();
4330        _mm256_store_si256(ptr::addr_of_mut!(r), a);
4331        assert_eq_m256i(r, a);
4332    }
4333
4334    #[simd_test(enable = "avx")]
4335    const unsafe fn test_mm256_loadu_si256() {
4336        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4337        let p = ptr::addr_of!(a);
4338        let r = _mm256_loadu_si256(black_box(p));
4339        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4340        assert_eq_m256i(r, e);
4341    }
4342
4343    #[simd_test(enable = "avx")]
4344    const unsafe fn test_mm256_storeu_si256() {
4345        let a = _mm256_set1_epi8(9);
4346        let mut r = _mm256_undefined_si256();
4347        _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4348        assert_eq_m256i(r, a);
4349    }
4350
4351    #[simd_test(enable = "avx")]
4352    const unsafe fn test_mm256_maskload_pd() {
4353        let a = &[1.0f64, 2., 3., 4.];
4354        let p = a.as_ptr();
4355        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
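        // Only elements whose mask has the sign (top) bit set are loaded; the rest are zeroed.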
4356        let r = _mm256_maskload_pd(black_box(p), mask);
4357        let e = _mm256_setr_pd(0., 2., 0., 4.);
4358        assert_eq_m256d(r, e);
4359    }
4360
4361    #[simd_test(enable = "avx")]
4362    const unsafe fn test_mm256_maskstore_pd() {
4363        let mut r = _mm256_set1_pd(0.);
4364        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4365        let a = _mm256_setr_pd(1., 2., 3., 4.);
4366        _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4367        let e = _mm256_setr_pd(0., 2., 0., 4.);
4368        assert_eq_m256d(r, e);
4369    }
4370
4371    #[simd_test(enable = "avx")]
4372    const unsafe fn test_mm_maskload_pd() {
4373        let a = &[1.0f64, 2.];
4374        let p = a.as_ptr();
4375        let mask = _mm_setr_epi64x(0, !0);
4376        let r = _mm_maskload_pd(black_box(p), mask);
4377        let e = _mm_setr_pd(0., 2.);
4378        assert_eq_m128d(r, e);
4379    }
4380
4381    #[simd_test(enable = "avx")]
4382    const unsafe fn test_mm_maskstore_pd() {
4383        let mut r = _mm_set1_pd(0.);
4384        let mask = _mm_setr_epi64x(0, !0);
4385        let a = _mm_setr_pd(1., 2.);
4386        _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4387        let e = _mm_setr_pd(0., 2.);
4388        assert_eq_m128d(r, e);
4389    }
4390
4391    #[simd_test(enable = "avx")]
4392    const unsafe fn test_mm256_maskload_ps() {
4393        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4394        let p = a.as_ptr();
4395        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4396        let r = _mm256_maskload_ps(black_box(p), mask);
4397        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4398        assert_eq_m256(r, e);
4399    }
4400
4401    #[simd_test(enable = "avx")]
4402    const unsafe fn test_mm256_maskstore_ps() {
4403        let mut r = _mm256_set1_ps(0.);
4404        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4405        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4406        _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4407        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4408        assert_eq_m256(r, e);
4409    }
4410
4411    #[simd_test(enable = "avx")]
4412    const unsafe fn test_mm_maskload_ps() {
4413        let a = &[1.0f32, 2., 3., 4.];
4414        let p = a.as_ptr();
4415        let mask = _mm_setr_epi32(0, !0, 0, !0);
4416        let r = _mm_maskload_ps(black_box(p), mask);
4417        let e = _mm_setr_ps(0., 2., 0., 4.);
4418        assert_eq_m128(r, e);
4419    }
4420
4421    #[simd_test(enable = "avx")]
4422    const unsafe fn test_mm_maskstore_ps() {
4423        let mut r = _mm_set1_ps(0.);
4424        let mask = _mm_setr_epi32(0, !0, 0, !0);
4425        let a = _mm_setr_ps(1., 2., 3., 4.);
4426        _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4427        let e = _mm_setr_ps(0., 2., 0., 4.);
4428        assert_eq_m128(r, e);
4429    }
4430
4431    #[simd_test(enable = "avx")]
4432    const fn test_mm256_movehdup_ps() {
4433        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4434        let r = _mm256_movehdup_ps(a);
4435        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4436        assert_eq_m256(r, e);
4437    }
4438
4439    #[simd_test(enable = "avx")]
4440    const fn test_mm256_moveldup_ps() {
4441        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4442        let r = _mm256_moveldup_ps(a);
4443        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4444        assert_eq_m256(r, e);
4445    }
4446
4447    #[simd_test(enable = "avx")]
4448    const fn test_mm256_movedup_pd() {
4449        let a = _mm256_setr_pd(1., 2., 3., 4.);
4450        let r = _mm256_movedup_pd(a);
4451        let e = _mm256_setr_pd(1., 1., 3., 3.);
4452        assert_eq_m256d(r, e);
4453    }
4454
4455    #[simd_test(enable = "avx")]
4456    unsafe fn test_mm256_lddqu_si256() {
4457        #[rustfmt::skip]
4458        let a = _mm256_setr_epi8(
4459            1, 2, 3, 4, 5, 6, 7, 8,
4460            9, 10, 11, 12, 13, 14, 15, 16,
4461            17, 18, 19, 20, 21, 22, 23, 24,
4462            25, 26, 27, 28, 29, 30, 31, 32,
4463        );
4464        let p = ptr::addr_of!(a);
4465        let r = _mm256_lddqu_si256(black_box(p));
4466        #[rustfmt::skip]
4467        let e = _mm256_setr_epi8(
4468            1, 2, 3, 4, 5, 6, 7, 8,
4469            9, 10, 11, 12, 13, 14, 15, 16,
4470            17, 18, 19, 20, 21, 22, 23, 24,
4471            25, 26, 27, 28, 29, 30, 31, 32,
4472        );
4473        assert_eq_m256i(r, e);
4474    }
4475
4476    #[simd_test(enable = "avx")]
4477    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4478    unsafe fn test_mm256_stream_si256() {
4479        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4480        let mut r = _mm256_undefined_si256();
4481        _mm256_stream_si256(ptr::addr_of_mut!(r), a);
4482        _mm_sfence();
4483        assert_eq_m256i(r, a);
4484    }
4485
4486    #[simd_test(enable = "avx")]
4487    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4488    unsafe fn test_mm256_stream_pd() {
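        // Non-temporal stores require a 32-byte-aligned destination, hence the aligned
        // wrapper struct; `_mm_sfence` makes the store visible before the data is checked.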
4489        #[repr(align(32))]
4490        struct Memory {
4491            pub data: [f64; 4],
4492        }
4493        let a = _mm256_set1_pd(7.0);
4494        let mut mem = Memory { data: [-1.0; 4] };
4495
4496        _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4497        _mm_sfence();
4498        for i in 0..4 {
4499            assert_eq!(mem.data[i], get_m256d(a, i));
4500        }
4501    }
4502
4503    #[simd_test(enable = "avx")]
4504    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4505    unsafe fn test_mm256_stream_ps() {
4506        #[repr(align(32))]
4507        struct Memory {
4508            pub data: [f32; 8],
4509        }
4510        let a = _mm256_set1_ps(7.0);
4511        let mut mem = Memory { data: [-1.0; 8] };
4512
4513        _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
4514        _mm_sfence();
4515        for i in 0..8 {
4516            assert_eq!(mem.data[i], get_m256(a, i));
4517        }
4518    }
4519
4520    #[simd_test(enable = "avx")]
4521    fn test_mm256_rcp_ps() {
4522        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4523        let r = _mm256_rcp_ps(a);
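        // vrcpps only returns an approximation (relative error on the order of 2^-12),
        // so compare against the expected values with a tolerance; likewise for
        // `_mm256_rsqrt_ps` below.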
4524        #[rustfmt::skip]
4525        let e = _mm256_setr_ps(
4526            0.99975586, 0.49987793, 0.33325195, 0.24993896,
4527            0.19995117, 0.16662598, 0.14282227, 0.12496948,
4528        );
4529        let rel_err = 0.00048828125;
4530        for i in 0..8 {
4531            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4532        }
4533    }
4534
4535    #[simd_test(enable = "avx")]
4536    fn test_mm256_rsqrt_ps() {
4537        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4538        let r = _mm256_rsqrt_ps(a);
4539        #[rustfmt::skip]
4540        let e = _mm256_setr_ps(
4541            0.99975586, 0.7069092, 0.5772705, 0.49987793,
4542            0.44714355, 0.40820313, 0.3779297, 0.3534546,
4543        );
4544        let rel_err = 0.00048828125;
4545        for i in 0..8 {
4546            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4547        }
4548    }
4549
4550    #[simd_test(enable = "avx")]
4551    const fn test_mm256_unpackhi_pd() {
4552        let a = _mm256_setr_pd(1., 2., 3., 4.);
4553        let b = _mm256_setr_pd(5., 6., 7., 8.);
4554        let r = _mm256_unpackhi_pd(a, b);
4555        let e = _mm256_setr_pd(2., 6., 4., 8.);
4556        assert_eq_m256d(r, e);
4557    }
4558
4559    #[simd_test(enable = "avx")]
4560    const fn test_mm256_unpackhi_ps() {
4561        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4562        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4563        let r = _mm256_unpackhi_ps(a, b);
4564        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
4565        assert_eq_m256(r, e);
4566    }
4567
4568    #[simd_test(enable = "avx")]
4569    const fn test_mm256_unpacklo_pd() {
4570        let a = _mm256_setr_pd(1., 2., 3., 4.);
4571        let b = _mm256_setr_pd(5., 6., 7., 8.);
4572        let r = _mm256_unpacklo_pd(a, b);
4573        let e = _mm256_setr_pd(1., 5., 3., 7.);
4574        assert_eq_m256d(r, e);
4575    }
4576
4577    #[simd_test(enable = "avx")]
4578    const fn test_mm256_unpacklo_ps() {
4579        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4580        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4581        let r = _mm256_unpacklo_ps(a, b);
4582        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
4583        assert_eq_m256(r, e);
4584    }
4585
4586    #[simd_test(enable = "avx")]
4587    const fn test_mm256_testz_si256() {
4588        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4589        let b = _mm256_setr_epi64x(5, 6, 7, 8);
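        // testz returns 1 exactly when `a & b` is all zeros; testc uses `!a & b`, and
        // testnzc returns 1 only when both `a & b` and `!a & b` are non-zero.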
4590        let r = _mm256_testz_si256(a, b);
4591        assert_eq!(r, 0);
4592        let b = _mm256_set1_epi64x(0);
4593        let r = _mm256_testz_si256(a, b);
4594        assert_eq!(r, 1);
4595    }
4596
4597    #[simd_test(enable = "avx")]
4598    const fn test_mm256_testc_si256() {
4599        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4600        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4601        let r = _mm256_testc_si256(a, b);
4602        assert_eq!(r, 0);
4603        let b = _mm256_set1_epi64x(0);
4604        let r = _mm256_testc_si256(a, b);
4605        assert_eq!(r, 1);
4606    }
4607
4608    #[simd_test(enable = "avx")]
4609    fn test_mm256_testnzc_si256() {
4610        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4611        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4612        let r = _mm256_testnzc_si256(a, b);
4613        assert_eq!(r, 1);
4614        let a = _mm256_setr_epi64x(0, 0, 0, 0);
4615        let b = _mm256_setr_epi64x(0, 0, 0, 0);
4616        let r = _mm256_testnzc_si256(a, b);
4617        assert_eq!(r, 0);
4618    }
4619
4620    #[simd_test(enable = "avx")]
4621    fn test_mm256_testz_pd() {
4622        let a = _mm256_setr_pd(1., 2., 3., 4.);
4623        let b = _mm256_setr_pd(5., 6., 7., 8.);
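        // The floating-point test intrinsics only examine the sign bit of each element.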
4624        let r = _mm256_testz_pd(a, b);
4625        assert_eq!(r, 1);
4626        let a = _mm256_set1_pd(-1.);
4627        let r = _mm256_testz_pd(a, a);
4628        assert_eq!(r, 0);
4629    }
4630
4631    #[simd_test(enable = "avx")]
4632    fn test_mm256_testc_pd() {
4633        let a = _mm256_setr_pd(1., 2., 3., 4.);
4634        let b = _mm256_setr_pd(5., 6., 7., 8.);
4635        let r = _mm256_testc_pd(a, b);
4636        assert_eq!(r, 1);
4637        let a = _mm256_set1_pd(1.);
4638        let b = _mm256_set1_pd(-1.);
4639        let r = _mm256_testc_pd(a, b);
4640        assert_eq!(r, 0);
4641    }
4642
4643    #[simd_test(enable = "avx")]
4644    fn test_mm256_testnzc_pd() {
4645        let a = _mm256_setr_pd(1., 2., 3., 4.);
4646        let b = _mm256_setr_pd(5., 6., 7., 8.);
4647        let r = _mm256_testnzc_pd(a, b);
4648        assert_eq!(r, 0);
4649        let a = _mm256_setr_pd(1., -1., -1., -1.);
4650        let b = _mm256_setr_pd(-1., -1., 1., 1.);
4651        let r = _mm256_testnzc_pd(a, b);
4652        assert_eq!(r, 1);
4653    }
4654
4655    #[simd_test(enable = "avx")]
4656    const fn test_mm_testz_pd() {
4657        let a = _mm_setr_pd(1., 2.);
4658        let b = _mm_setr_pd(5., 6.);
4659        let r = _mm_testz_pd(a, b);
4660        assert_eq!(r, 1);
4661        let a = _mm_set1_pd(-1.);
4662        let r = _mm_testz_pd(a, a);
4663        assert_eq!(r, 0);
4664    }
4665
4666    #[simd_test(enable = "avx")]
4667    const fn test_mm_testc_pd() {
4668        let a = _mm_setr_pd(1., 2.);
4669        let b = _mm_setr_pd(5., 6.);
4670        let r = _mm_testc_pd(a, b);
4671        assert_eq!(r, 1);
4672        let a = _mm_set1_pd(1.);
4673        let b = _mm_set1_pd(-1.);
4674        let r = _mm_testc_pd(a, b);
4675        assert_eq!(r, 0);
4676    }
4677
4678    #[simd_test(enable = "avx")]
4679    fn test_mm_testnzc_pd() {
4680        let a = _mm_setr_pd(1., 2.);
4681        let b = _mm_setr_pd(5., 6.);
4682        let r = _mm_testnzc_pd(a, b);
4683        assert_eq!(r, 0);
4684        let a = _mm_setr_pd(1., -1.);
4685        let b = _mm_setr_pd(-1., -1.);
4686        let r = _mm_testnzc_pd(a, b);
4687        assert_eq!(r, 1);
4688    }
4689
4690    #[simd_test(enable = "avx")]
4691    fn test_mm256_testz_ps() {
4692        let a = _mm256_set1_ps(1.);
4693        let r = _mm256_testz_ps(a, a);
4694        assert_eq!(r, 1);
4695        let a = _mm256_set1_ps(-1.);
4696        let r = _mm256_testz_ps(a, a);
4697        assert_eq!(r, 0);
4698    }
4699
4700    #[simd_test(enable = "avx")]
4701    fn test_mm256_testc_ps() {
4702        let a = _mm256_set1_ps(1.);
4703        let r = _mm256_testc_ps(a, a);
4704        assert_eq!(r, 1);
4705        let b = _mm256_set1_ps(-1.);
4706        let r = _mm256_testc_ps(a, b);
4707        assert_eq!(r, 0);
4708    }
4709
4710    #[simd_test(enable = "avx")]
4711    fn test_mm256_testnzc_ps() {
4712        let a = _mm256_set1_ps(1.);
4713        let r = _mm256_testnzc_ps(a, a);
4714        assert_eq!(r, 0);
4715        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
4716        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
4717        let r = _mm256_testnzc_ps(a, b);
4718        assert_eq!(r, 1);
4719    }
4720
4721    #[simd_test(enable = "avx")]
4722    const fn test_mm_testz_ps() {
4723        let a = _mm_set1_ps(1.);
4724        let r = _mm_testz_ps(a, a);
4725        assert_eq!(r, 1);
4726        let a = _mm_set1_ps(-1.);
4727        let r = _mm_testz_ps(a, a);
4728        assert_eq!(r, 0);
4729    }
4730
4731    #[simd_test(enable = "avx")]
4732    const fn test_mm_testc_ps() {
4733        let a = _mm_set1_ps(1.);
4734        let r = _mm_testc_ps(a, a);
4735        assert_eq!(r, 1);
4736        let b = _mm_set1_ps(-1.);
4737        let r = _mm_testc_ps(a, b);
4738        assert_eq!(r, 0);
4739    }
4740
4741    #[simd_test(enable = "avx")]
4742    fn test_mm_testnzc_ps() {
4743        let a = _mm_set1_ps(1.);
4744        let r = _mm_testnzc_ps(a, a);
4745        assert_eq!(r, 0);
4746        let a = _mm_setr_ps(1., -1., -1., -1.);
4747        let b = _mm_setr_ps(-1., -1., 1., 1.);
4748        let r = _mm_testnzc_ps(a, b);
4749        assert_eq!(r, 1);
4750    }
4751
4752    #[simd_test(enable = "avx")]
4753    const fn test_mm256_movemask_pd() {
4754        let a = _mm256_setr_pd(1., -2., 3., -4.);
4755        let r = _mm256_movemask_pd(a);
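        // Sign bits are gathered into the low bits of the result: elements 1 and 3
        // are negative, giving 0b1010.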
4756        assert_eq!(r, 0xA);
4757    }
4758
4759    #[simd_test(enable = "avx")]
4760    const fn test_mm256_movemask_ps() {
4761        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
4762        let r = _mm256_movemask_ps(a);
4763        assert_eq!(r, 0xAA);
4764    }
4765
4766    #[simd_test(enable = "avx")]
4767    const fn test_mm256_setzero_pd() {
4768        let r = _mm256_setzero_pd();
4769        assert_eq_m256d(r, _mm256_set1_pd(0.));
4770    }
4771
4772    #[simd_test(enable = "avx")]
4773    const fn test_mm256_setzero_ps() {
4774        let r = _mm256_setzero_ps();
4775        assert_eq_m256(r, _mm256_set1_ps(0.));
4776    }
4777
4778    #[simd_test(enable = "avx")]
4779    const fn test_mm256_setzero_si256() {
4780        let r = _mm256_setzero_si256();
4781        assert_eq_m256i(r, _mm256_set1_epi8(0));
4782    }
4783
4784    #[simd_test(enable = "avx")]
4785    const fn test_mm256_set_pd() {
4786        let r = _mm256_set_pd(1., 2., 3., 4.);
4787        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4788    }
4789
4790    #[simd_test(enable = "avx")]
4791    const fn test_mm256_set_ps() {
4792        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4793        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
4794    }
4795
4796    #[simd_test(enable = "avx")]
4797    const fn test_mm256_set_epi8() {
4798        #[rustfmt::skip]
4799        let r = _mm256_set_epi8(
4800            1, 2, 3, 4, 5, 6, 7, 8,
4801            9, 10, 11, 12, 13, 14, 15, 16,
4802            17, 18, 19, 20, 21, 22, 23, 24,
4803            25, 26, 27, 28, 29, 30, 31, 32,
4804        );
4805        #[rustfmt::skip]
4806        let e = _mm256_setr_epi8(
4807            32, 31, 30, 29, 28, 27, 26, 25,
4808            24, 23, 22, 21, 20, 19, 18, 17,
4809            16, 15, 14, 13, 12, 11, 10, 9,
4810            8, 7, 6, 5, 4, 3, 2, 1
4811        );
4812        assert_eq_m256i(r, e);
4813    }
4814
4815    #[simd_test(enable = "avx")]
4816    const fn test_mm256_set_epi16() {
4817        #[rustfmt::skip]
4818        let r = _mm256_set_epi16(
4819            1, 2, 3, 4, 5, 6, 7, 8,
4820            9, 10, 11, 12, 13, 14, 15, 16,
4821        );
4822        #[rustfmt::skip]
4823        let e = _mm256_setr_epi16(
4824            16, 15, 14, 13, 12, 11, 10, 9,
4825            8, 7, 6, 5, 4, 3, 2, 1,
4826        );
4827        assert_eq_m256i(r, e);
4828    }
4829
4830    #[simd_test(enable = "avx")]
4831    const fn test_mm256_set_epi32() {
4832        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4833        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4834    }
4835
4836    #[simd_test(enable = "avx")]
4837    const fn test_mm256_set_epi64x() {
4838        let r = _mm256_set_epi64x(1, 2, 3, 4);
4839        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4840    }
4841
4842    #[simd_test(enable = "avx")]
4843    const fn test_mm256_setr_pd() {
4844        let r = _mm256_setr_pd(1., 2., 3., 4.);
4845        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4846    }
4847
4848    #[simd_test(enable = "avx")]
4849    const fn test_mm256_setr_ps() {
4850        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4851        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
4852    }
4853
4854    #[simd_test(enable = "avx")]
4855    const fn test_mm256_setr_epi8() {
4856        #[rustfmt::skip]
4857        let r = _mm256_setr_epi8(
4858            1, 2, 3, 4, 5, 6, 7, 8,
4859            9, 10, 11, 12, 13, 14, 15, 16,
4860            17, 18, 19, 20, 21, 22, 23, 24,
4861            25, 26, 27, 28, 29, 30, 31, 32,
4862        );
4863        #[rustfmt::skip]
4864        let e = _mm256_setr_epi8(
4865            1, 2, 3, 4, 5, 6, 7, 8,
4866            9, 10, 11, 12, 13, 14, 15, 16,
4867            17, 18, 19, 20, 21, 22, 23, 24,
4868            25, 26, 27, 28, 29, 30, 31, 32
4869        );
4870
4871        assert_eq_m256i(r, e);
4872    }
4873
4874    #[simd_test(enable = "avx")]
4875    const fn test_mm256_setr_epi16() {
4876        #[rustfmt::skip]
4877        let r = _mm256_setr_epi16(
4878            1, 2, 3, 4, 5, 6, 7, 8,
4879            9, 10, 11, 12, 13, 14, 15, 16,
4880        );
4881        #[rustfmt::skip]
4882        let e = _mm256_setr_epi16(
4883            1, 2, 3, 4, 5, 6, 7, 8,
4884            9, 10, 11, 12, 13, 14, 15, 16,
4885        );
4886        assert_eq_m256i(r, e);
4887    }
4888
4889    #[simd_test(enable = "avx")]
4890    const fn test_mm256_setr_epi32() {
4891        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4892        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4893    }
4894
4895    #[simd_test(enable = "avx")]
4896    const fn test_mm256_setr_epi64x() {
4897        let r = _mm256_setr_epi64x(1, 2, 3, 4);
4898        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4899    }
4900
4901    #[simd_test(enable = "avx")]
4902    const fn test_mm256_set1_pd() {
4903        let r = _mm256_set1_pd(1.);
4904        assert_eq_m256d(r, _mm256_set1_pd(1.));
4905    }
4906
4907    #[simd_test(enable = "avx")]
4908    const fn test_mm256_set1_ps() {
4909        let r = _mm256_set1_ps(1.);
4910        assert_eq_m256(r, _mm256_set1_ps(1.));
4911    }
4912
4913    #[simd_test(enable = "avx")]
4914    const fn test_mm256_set1_epi8() {
4915        let r = _mm256_set1_epi8(1);
4916        assert_eq_m256i(r, _mm256_set1_epi8(1));
4917    }
4918
4919    #[simd_test(enable = "avx")]
4920    const fn test_mm256_set1_epi16() {
4921        let r = _mm256_set1_epi16(1);
4922        assert_eq_m256i(r, _mm256_set1_epi16(1));
4923    }
4924
4925    #[simd_test(enable = "avx")]
4926    const fn test_mm256_set1_epi32() {
4927        let r = _mm256_set1_epi32(1);
4928        assert_eq_m256i(r, _mm256_set1_epi32(1));
4929    }
4930
4931    #[simd_test(enable = "avx")]
4932    const fn test_mm256_set1_epi64x() {
4933        let r = _mm256_set1_epi64x(1);
4934        assert_eq_m256i(r, _mm256_set1_epi64x(1));
4935    }
4936
4937    #[simd_test(enable = "avx")]
4938    const fn test_mm256_castpd_ps() {
4939        let a = _mm256_setr_pd(1., 2., 3., 4.);
4940        let r = _mm256_castpd_ps(a);
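        // Casts reinterpret bits: each f64 splits into two f32 patterns, e.g.
        // 1.0f64 = 0x3FF0_0000_0000_0000 becomes (0.0, 1.875).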
4941        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4942        assert_eq_m256(r, e);
4943    }
4944
4945    #[simd_test(enable = "avx")]
4946    const fn test_mm256_castps_pd() {
4947        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4948        let r = _mm256_castps_pd(a);
4949        let e = _mm256_setr_pd(1., 2., 3., 4.);
4950        assert_eq_m256d(r, e);
4951    }
4952
4953    #[simd_test(enable = "avx")]
4954    const fn test_mm256_castps_si256() {
4955        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4956        let r = _mm256_castps_si256(a);
4957        #[rustfmt::skip]
4958        let e = _mm256_setr_epi8(
4959            0, 0, -128, 63, 0, 0, 0, 64,
4960            0, 0, 64, 64, 0, 0, -128, 64,
4961            0, 0, -96, 64, 0, 0, -64, 64,
4962            0, 0, -32, 64, 0, 0, 0, 65,
4963        );
4964        assert_eq_m256i(r, e);
4965    }
4966
4967    #[simd_test(enable = "avx")]
4968    const fn test_mm256_castsi256_ps() {
4969        #[rustfmt::skip]
4970        let a = _mm256_setr_epi8(
4971            0, 0, -128, 63, 0, 0, 0, 64,
4972            0, 0, 64, 64, 0, 0, -128, 64,
4973            0, 0, -96, 64, 0, 0, -64, 64,
4974            0, 0, -32, 64, 0, 0, 0, 65,
4975        );
4976        let r = _mm256_castsi256_ps(a);
4977        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4978        assert_eq_m256(r, e);
4979    }
4980
4981    #[simd_test(enable = "avx")]
4982    const fn test_mm256_castpd_si256() {
4983        let a = _mm256_setr_pd(1., 2., 3., 4.);
4984        let r = _mm256_castpd_si256(a);
4985        assert_eq_m256d(unsafe { transmute(r) }, a);
4986    }
4987
4988    #[simd_test(enable = "avx")]
4989    const fn test_mm256_castsi256_pd() {
4990        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4991        let r = _mm256_castsi256_pd(a);
4992        assert_eq_m256d(r, unsafe { transmute(a) });
4993    }
4994
4995    #[simd_test(enable = "avx")]
4996    const fn test_mm256_castps256_ps128() {
4997        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4998        let r = _mm256_castps256_ps128(a);
4999        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
5000    }
5001
5002    #[simd_test(enable = "avx")]
5003    const fn test_mm256_castpd256_pd128() {
5004        let a = _mm256_setr_pd(1., 2., 3., 4.);
5005        let r = _mm256_castpd256_pd128(a);
5006        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
5007    }
5008
5009    #[simd_test(enable = "avx")]
5010    const fn test_mm256_castsi256_si128() {
5011        let a = _mm256_setr_epi64x(1, 2, 3, 4);
5012        let r = _mm256_castsi256_si128(a);
5013        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
5014    }
5015
5016    #[simd_test(enable = "avx")]
5017    const fn test_mm256_castps128_ps256() {
5018        let a = _mm_setr_ps(1., 2., 3., 4.);
5019        let r = _mm256_castps128_ps256(a);
5020        assert_eq_m128(_mm256_castps256_ps128(r), a);
5021    }
5022
5023    #[simd_test(enable = "avx")]
5024    const fn test_mm256_castpd128_pd256() {
5025        let a = _mm_setr_pd(1., 2.);
5026        let r = _mm256_castpd128_pd256(a);
5027        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
5028    }
5029
5030    #[simd_test(enable = "avx")]
5031    const fn test_mm256_castsi128_si256() {
5032        let a = _mm_setr_epi32(1, 2, 3, 4);
5033        let r = _mm256_castsi128_si256(a);
5034        assert_eq_m128i(_mm256_castsi256_si128(r), a);
5035    }
5036
5037    #[simd_test(enable = "avx")]
5038    const fn test_mm256_zextps128_ps256() {
5039        let a = _mm_setr_ps(1., 2., 3., 4.);
5040        let r = _mm256_zextps128_ps256(a);
5041        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
5042        assert_eq_m256(r, e);
5043    }
5044
5045    #[simd_test(enable = "avx")]
5046    const fn test_mm256_zextsi128_si256() {
5047        let a = _mm_setr_epi64x(1, 2);
5048        let r = _mm256_zextsi128_si256(a);
5049        let e = _mm256_setr_epi64x(1, 2, 0, 0);
5050        assert_eq_m256i(r, e);
5051    }
5052
5053    #[simd_test(enable = "avx")]
5054    const fn test_mm256_zextpd128_pd256() {
5055        let a = _mm_setr_pd(1., 2.);
5056        let r = _mm256_zextpd128_pd256(a);
5057        let e = _mm256_setr_pd(1., 2., 0., 0.);
5058        assert_eq_m256d(r, e);
5059    }
5060
5061    #[simd_test(enable = "avx")]
5062    const fn test_mm256_set_m128() {
5063        let hi = _mm_setr_ps(5., 6., 7., 8.);
5064        let lo = _mm_setr_ps(1., 2., 3., 4.);
5065        let r = _mm256_set_m128(hi, lo);
5066        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5067        assert_eq_m256(r, e);
5068    }
5069
5070    #[simd_test(enable = "avx")]
5071    const fn test_mm256_set_m128d() {
5072        let hi = _mm_setr_pd(3., 4.);
5073        let lo = _mm_setr_pd(1., 2.);
5074        let r = _mm256_set_m128d(hi, lo);
5075        let e = _mm256_setr_pd(1., 2., 3., 4.);
5076        assert_eq_m256d(r, e);
5077    }
5078
5079    #[simd_test(enable = "avx")]
5080    const fn test_mm256_set_m128i() {
5081        #[rustfmt::skip]
5082        let hi = _mm_setr_epi8(
5083            17, 18, 19, 20,
5084            21, 22, 23, 24,
5085            25, 26, 27, 28,
5086            29, 30, 31, 32,
5087        );
5088        #[rustfmt::skip]
5089        let lo = _mm_setr_epi8(
5090            1, 2, 3, 4,
5091            5, 6, 7, 8,
5092            9, 10, 11, 12,
5093            13, 14, 15, 16,
5094        );
5095        let r = _mm256_set_m128i(hi, lo);
5096        #[rustfmt::skip]
5097        let e = _mm256_setr_epi8(
5098            1, 2, 3, 4, 5, 6, 7, 8,
5099            9, 10, 11, 12, 13, 14, 15, 16,
5100            17, 18, 19, 20, 21, 22, 23, 24,
5101            25, 26, 27, 28, 29, 30, 31, 32,
5102        );
5103        assert_eq_m256i(r, e);
5104    }
5105
5106    #[simd_test(enable = "avx")]
5107    const fn test_mm256_setr_m128() {
5108        let lo = _mm_setr_ps(1., 2., 3., 4.);
5109        let hi = _mm_setr_ps(5., 6., 7., 8.);
5110        let r = _mm256_setr_m128(lo, hi);
5111        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5112        assert_eq_m256(r, e);
5113    }
5114
5115    #[simd_test(enable = "avx")]
5116    const fn test_mm256_setr_m128d() {
5117        let lo = _mm_setr_pd(1., 2.);
5118        let hi = _mm_setr_pd(3., 4.);
5119        let r = _mm256_setr_m128d(lo, hi);
5120        let e = _mm256_setr_pd(1., 2., 3., 4.);
5121        assert_eq_m256d(r, e);
5122    }
5123
5124    #[simd_test(enable = "avx")]
5125    const fn test_mm256_setr_m128i() {
5126        #[rustfmt::skip]
5127        let lo = _mm_setr_epi8(
5128            1, 2, 3, 4,
5129            5, 6, 7, 8,
5130            9, 10, 11, 12,
5131            13, 14, 15, 16,
5132        );
5133        #[rustfmt::skip]
5134        let hi = _mm_setr_epi8(
5135            17, 18, 19, 20, 21, 22, 23, 24,
5136            25, 26, 27, 28, 29, 30, 31, 32,
5137        );
5138        let r = _mm256_setr_m128i(lo, hi);
5139        #[rustfmt::skip]
5140        let e = _mm256_setr_epi8(
5141            1, 2, 3, 4, 5, 6, 7, 8,
5142            9, 10, 11, 12, 13, 14, 15, 16,
5143            17, 18, 19, 20, 21, 22, 23, 24,
5144            25, 26, 27, 28, 29, 30, 31, 32,
5145        );
5146        assert_eq_m256i(r, e);
5147    }
5148
5149    #[simd_test(enable = "avx")]
5150    const unsafe fn test_mm256_loadu2_m128() {
5151        let hi = &[5., 6., 7., 8.];
5152        let hiaddr = hi.as_ptr();
5153        let lo = &[1., 2., 3., 4.];
5154        let loaddr = lo.as_ptr();
5155        let r = _mm256_loadu2_m128(hiaddr, loaddr);
5156        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5157        assert_eq_m256(r, e);
5158    }
5159
5160    #[simd_test(enable = "avx")]
5161    const unsafe fn test_mm256_loadu2_m128d() {
5162        let hi = &[3., 4.];
5163        let hiaddr = hi.as_ptr();
5164        let lo = &[1., 2.];
5165        let loaddr = lo.as_ptr();
5166        let r = _mm256_loadu2_m128d(hiaddr, loaddr);
5167        let e = _mm256_setr_pd(1., 2., 3., 4.);
5168        assert_eq_m256d(r, e);
5169    }
5170
5171    #[simd_test(enable = "avx")]
5172    const unsafe fn test_mm256_loadu2_m128i() {
5173        #[rustfmt::skip]
5174        let hi = _mm_setr_epi8(
5175            17, 18, 19, 20, 21, 22, 23, 24,
5176            25, 26, 27, 28, 29, 30, 31, 32,
5177        );
5178        #[rustfmt::skip]
5179        let lo = _mm_setr_epi8(
5180            1, 2, 3, 4, 5, 6, 7, 8,
5181            9, 10, 11, 12, 13, 14, 15, 16,
5182        );
5183        let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
5184        #[rustfmt::skip]
5185        let e = _mm256_setr_epi8(
5186            1, 2, 3, 4, 5, 6, 7, 8,
5187            9, 10, 11, 12, 13, 14, 15, 16,
5188            17, 18, 19, 20, 21, 22, 23, 24,
5189            25, 26, 27, 28, 29, 30, 31, 32,
5190        );
5191        assert_eq_m256i(r, e);
5192    }
5193
5194    #[simd_test(enable = "avx")]
5195    const unsafe fn test_mm256_storeu2_m128() {
5196        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5197        let mut hi = _mm_undefined_ps();
5198        let mut lo = _mm_undefined_ps();
5199        _mm256_storeu2_m128(
5200            ptr::addr_of_mut!(hi) as *mut f32,
5201            ptr::addr_of_mut!(lo) as *mut f32,
5202            a,
5203        );
5204        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
5205        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
5206    }
5207
5208    #[simd_test(enable = "avx")]
5209    const unsafe fn test_mm256_storeu2_m128d() {
5210        let a = _mm256_setr_pd(1., 2., 3., 4.);
5211        let mut hi = _mm_undefined_pd();
5212        let mut lo = _mm_undefined_pd();
5213        _mm256_storeu2_m128d(
5214            ptr::addr_of_mut!(hi) as *mut f64,
5215            ptr::addr_of_mut!(lo) as *mut f64,
5216            a,
5217        );
5218        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
5219        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
5220    }
5221
5222    #[simd_test(enable = "avx")]
5223    const unsafe fn test_mm256_storeu2_m128i() {
5224        #[rustfmt::skip]
5225        let a = _mm256_setr_epi8(
5226            1, 2, 3, 4, 5, 6, 7, 8,
5227            9, 10, 11, 12, 13, 14, 15, 16,
5228            17, 18, 19, 20, 21, 22, 23, 24,
5229            25, 26, 27, 28, 29, 30, 31, 32,
5230        );
5231        let mut hi = _mm_undefined_si128();
5232        let mut lo = _mm_undefined_si128();
5233        _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
5234        #[rustfmt::skip]
5235        let e_hi = _mm_setr_epi8(
5236            17, 18, 19, 20, 21, 22, 23, 24,
5237            25, 26, 27, 28, 29, 30, 31, 32
5238        );
5239        #[rustfmt::skip]
5240        let e_lo = _mm_setr_epi8(
5241            1, 2, 3, 4, 5, 6, 7, 8,
5242            9, 10, 11, 12, 13, 14, 15, 16
5243        );
5244
5245        assert_eq_m128i(hi, e_hi);
5246        assert_eq_m128i(lo, e_lo);
5247    }
5248
5249    #[simd_test(enable = "avx")]
5250    const fn test_mm256_cvtss_f32() {
5251        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5252        let r = _mm256_cvtss_f32(a);
5253        assert_eq!(r, 1.);
5254    }
5255}