core/stdarch/crates/core_arch/src/x86/avx.rs

//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    mem, ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_add(a, b) }
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_add(a, b) }
}
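
// Editor's note: the sketch below is not part of the upstream file. It shows a
// minimal use of the element-wise arithmetic intrinsics above, following the
// crate's own `simd_test` convention; the module and function names are
// illustrative only.
#[cfg(test)]
mod add_ps_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_add_ps() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm256_setr_ps(10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0);
        // Each output lane is the sum of the corresponding input lanes.
        let r = _mm256_add_ps(a, b);
        let mut out = [0.0f32; 8];
        unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
        assert_eq!(out, [11.0, 22.0, 33.0, 44.0, 55.0, 66.0, 77.0, 88.0]);
    }
}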

/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}
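
// Editor's sketch (not in the upstream file): how the four 2-bit fields of
// `MASK` select lanes for `_mm256_shuffle_ps`, per 128-bit half. Names are
// illustrative only.
#[cfg(test)]
mod shuffle_ps_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_shuffle_ps() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm256_setr_ps(10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0);
        // Within each half: bits 0-3 of MASK pick two lanes from `a`,
        // bits 4-7 pick two lanes from `b`.
        let r = _mm256_shuffle_ps::<0b01_00_11_10>(a, b);
        let mut out = [0.0f32; 8];
        unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
        assert_eq!(out, [3.0, 4.0, 10.0, 20.0, 7.0, 8.0, 50.0, 60.0]);
    }
}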

/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
    }
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`
/// and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
    }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vmaxpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vmaxps(a, b) }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vminpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vminps(a, b) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_mul(a, b) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_mul(a, b) }
}

/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a = a.as_f64x4();
        let b = b.as_f64x4();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}
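
// Editor's sketch (not in the upstream file): `_mm256_addsub_pd` subtracts in
// the even lanes and adds in the odd lanes. Names are illustrative only.
#[cfg(test)]
mod addsub_pd_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_addsub_pd() {
        let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
        let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
        let r = _mm256_addsub_pd(a, b);
        let mut out = [0.0f64; 4];
        unsafe { _mm256_storeu_pd(out.as_mut_ptr(), r) };
        // [a0 - b0, a1 + b1, a2 - b2, a3 + b3]
        assert_eq!(out, [-9.0, 22.0, -27.0, 44.0]);
    }
}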

/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a = a.as_f32x8();
        let b = b.as_f32x8();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_sub(a, b) }
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_sub(a, b) }
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_div(a, b) }
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_div(a, b) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd256(a, ROUNDING) }
}
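
// Editor's sketch (not in the upstream file): the low bits of `ROUNDING`
// select the rounding mode, e.g. 0x01 rounds down and 0x03 truncates. Names
// are illustrative only.
#[cfg(test)]
mod round_pd_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_round_pd() {
        let a = _mm256_setr_pd(1.5, -1.9, 2.7, -2.2);
        let down = _mm256_round_pd::<0x01>(a);
        let trunc = _mm256_round_pd::<0x03>(a);
        let (mut d, mut t) = ([0.0f64; 4], [0.0f64; 4]);
        unsafe { _mm256_storeu_pd(d.as_mut_ptr(), down) };
        unsafe { _mm256_storeu_pd(t.as_mut_ptr(), trunc) };
        assert_eq!(d, [1.0, -2.0, 2.0, -3.0]);
        assert_eq!(t, [1.0, -1.0, 2.0, -2.0]);
    }
}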

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    unsafe { simd_ceil(a) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_pd(a: __m256d) -> __m256d {
    unsafe { simd_floor(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps256(a, ROUNDING) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_ps(a: __m256) -> __m256 {
    unsafe { simd_ceil(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_ps(a: __m256) -> __m256 {
    unsafe { simd_floor(a) }
}

/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    unsafe { simd_fsqrt(a) }
}

/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    unsafe { simd_fsqrt(a) }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}
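
// Editor's sketch (not in the upstream file): each bit of `IMM8` chooses the
// corresponding lane from `b` (bit set) or `a` (bit clear). Names are
// illustrative only.
#[cfg(test)]
mod blend_ps_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_blend_ps() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm256_setr_ps(10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0);
        // Bits 0 and 3 are set, so lanes 0 and 3 come from `b`.
        let r = _mm256_blend_ps::<0b0000_1001>(a, b);
        let mut out = [0.0f32; 8];
        unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
        assert_eq!(out, [10.0, 2.0, 3.0, 40.0, 5.0, 6.0, 7.0, 8.0]);
    }
}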

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe {
        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe {
        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
    }
}

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`, sums the four
/// products, and conditionally returns the sum using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vdpps(a, b, IMM8) }
}
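
// Editor's sketch (not in the upstream file): with IMM8 = 0xFF every product
// participates in the dot product and the per-half sum is broadcast to all
// four lanes of that half. Names are illustrative only.
#[cfg(test)]
mod dp_ps_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_dp_ps() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm256_set1_ps(1.0);
        let r = _mm256_dp_ps::<0xFF>(a, b);
        let mut out = [0.0f32; 8];
        unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
        // Lower half: 1+2+3+4 = 10, upper half: 5+6+7+8 = 26.
        assert_eq!(out, [10.0, 10.0, 10.0, 10.0, 26.0, 26.0, 26.0, 26.0]);
    }
}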

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vhaddpd(a, b) }
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations
/// 0, 1, 4, 5, while sums of elements from `b` are returned in locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vhaddps(a, b) }
}
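
// Editor's sketch (not in the upstream file): where `_mm256_hadd_ps` places
// the pairwise sums of `a` and `b`. Names are illustrative only.
#[cfg(test)]
mod hadd_ps_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_hadd_ps() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm256_setr_ps(10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0);
        let r = _mm256_hadd_ps(a, b);
        let mut out = [0.0f32; 8];
        unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
        // Sums from `a` land in lanes 0, 1, 4, 5; sums from `b` in 2, 3, 6, 7.
        assert_eq!(out, [3.0, 7.0, 30.0, 70.0, 11.0, 15.0, 110.0, 150.0]);
    }
}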

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vhsubpd(a, b) }
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// 0, 1, 4, 5, while differences of elements from `b` are returned in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vhsubps(a, b) }
}

/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
}

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmppd256(a, b, IMM5 as u8) }
}
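
// Editor's sketch (not in the upstream file): a comparison predicate such as
// `_CMP_LT_OQ` yields an all-ones/all-zeros mask per lane, which can be
// inspected with `_mm256_movemask_pd`. Names are illustrative only.
#[cfg(test)]
mod cmp_pd_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_cmp_pd() {
        let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
        let b = _mm256_set1_pd(2.0);
        // Only lane 0 satisfies a < b.
        let mask = _mm256_cmp_pd::<_CMP_LT_OQ>(a, b);
        assert_eq!(_mm256_movemask_pd(mask), 0b0001);
    }
}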

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
}

/// Compares the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper element from `a` to the upper element of the
/// returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpsd(a, b, IMM5 as i8) }
}

/// Compares the lower single-precision (32-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper 3 packed elements from `a` to the upper elements of
/// the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpss(a, b, IMM5 as i8) }
}

/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    unsafe { simd_cast(a.as_i32x4()) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    unsafe { simd_cast(a.as_i32x8()) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    unsafe { simd_cast(a) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvtps2dq(a)) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    unsafe { simd_cast(a) }
}

/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    unsafe { simd_extract!(a, 0) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvttpd2dq(a)) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvtpd2dq(a)) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvttps2dq(a)) }
}

/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
        )
    }
}
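
// Editor's sketch (not in the upstream file): `IMM1` selects the lower (0) or
// upper (1) 128-bit half. Names are illustrative only.
#[cfg(test)]
mod extractf128_ps_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_extractf128_ps() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let hi = _mm256_extractf128_ps::<1>(a);
        let mut out = [0.0f32; 4];
        unsafe { _mm_storeu_ps(out.as_mut_ptr(), hi) };
        assert_eq!(out, [5.0, 6.0, 7.0, 8.0]);
    }
}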

/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
}

/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
        transmute(dst)
    }
}

/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
}

/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    unsafe { simd_extract!(a.as_i32x8(), 0) }
}

/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    unsafe { vzeroall() }
}

/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128 bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    unsafe { vzeroupper() }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    unsafe { vpermilps256(a, b.as_i32x8()) }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    unsafe { vpermilps(a, b.as_i32x4()) }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                ((IMM8 as u32 >> 0) & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}
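
// Editor's sketch (not in the upstream file): `IMM8` applies the same four
// 2-bit selectors to both 128-bit halves of `a`; 0b00_01_10_11 reverses each
// half. Names are illustrative only.
#[cfg(test)]
mod permute_ps_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_permute_ps() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_permute_ps::<0b00_01_10_11>(a);
        let mut out = [0.0f32; 8];
        unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
        assert_eq!(out, [4.0, 3.0, 2.0, 1.0, 8.0, 7.0, 6.0, 5.0]);
    }
}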

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    unsafe { vpermilpd256(a, b.as_i64x4()) }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    unsafe { vpermilpd(a, b.as_i64x2()) }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                ((IMM4 as u32 >> 0) & 1),
                ((IMM4 as u32 >> 1) & 1),
                ((IMM4 as u32 >> 2) & 1) + 2,
                ((IMM4 as u32 >> 3) & 1) + 2,
            ],
        )
    }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_pd(),
            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
        )
    }
}

/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vperm2f128ps256(a, b, IMM8 as i8) }
}
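
// Editor's sketch (not in the upstream file): the low and high nibbles of
// `IMM8` each pick a 128-bit half (0 = a.lo, 1 = a.hi, 2 = b.lo, 3 = b.hi);
// 0x31 therefore yields [a.hi, b.hi]. Names are illustrative only.
#[cfg(test)]
mod permute2f128_ps_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_permute2f128_ps() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm256_setr_ps(10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0);
        let r = _mm256_permute2f128_ps::<0x31>(a, b);
        let mut out = [0.0f32; 8];
        unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
        assert_eq!(out, [5.0, 6.0, 7.0, 8.0, 50.0, 60.0, 70.0, 80.0]);
    }
}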

/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vperm2f128pd256(a, b, IMM8 as i8) }
}

/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) }
}

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    _mm256_set1_ps(*f)
}

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 {
    _mm_set1_ps(*f)
}

/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    _mm256_set1_pd(*f)
}

/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])
}

/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1])
}

/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_castps128_ps256(b),
            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
        )
    }
}
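
// Editor's sketch (not in the upstream file): `IMM1` chooses which 128-bit
// half of `a` is replaced by `b`. Names are illustrative only.
#[cfg(test)]
mod insertf128_ps_usage_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn sketch_mm256_insertf128_ps() {
        let a = _mm256_set1_ps(1.0);
        let b = _mm_set1_ps(2.0);
        // IMM1 = 1 replaces the upper half of `a` with `b`.
        let r = _mm256_insertf128_ps::<1>(a, b);
        let mut out = [0.0f32; 8];
        unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
        assert_eq!(out, [1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0]);
    }
}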
1347
1348/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
1349/// double-precision (64-bit) floating-point elements) from `b` into result
1350/// at the location specified by `imm8`.
1351///
1352/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
1353#[inline]
1354#[target_feature(enable = "avx")]
1355#[cfg_attr(
1356    all(test, not(target_env = "msvc")),
1357    assert_instr(vinsertf128, IMM1 = 1)
1358)]
1359#[rustc_legacy_const_generics(2)]
1360#[stable(feature = "simd_x86", since = "1.27.0")]
1361pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
1362    static_assert_uimm_bits!(IMM1, 1);
1363    unsafe {
1364        simd_shuffle!(
1365            a,
1366            _mm256_castpd128_pd256(b),
1367            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1368        )
1369    }
1370}
1371
1372/// Copies `a` to result, then inserts 128 bits from `b` into result
1373/// at the location specified by `IMM1`.
1374///
1375/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
1376#[inline]
1377#[target_feature(enable = "avx")]
1378#[cfg_attr(
1379    all(test, not(target_env = "msvc")),
1380    assert_instr(vinsertf128, IMM1 = 1)
1381)]
1382#[rustc_legacy_const_generics(2)]
1383#[stable(feature = "simd_x86", since = "1.27.0")]
1384pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1385    static_assert_uimm_bits!(IMM1, 1);
1386    unsafe {
1387        let dst: i64x4 = simd_shuffle!(
1388            a.as_i64x4(),
1389            _mm256_castsi128_si256(b).as_i64x4(),
1390            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1391        );
1392        transmute(dst)
1393    }
1394}
1395
1396/// Copies `a` to result, and inserts the 8-bit integer `i` into result
1397/// at the location specified by `INDEX`.
1398///
1399/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
1400#[inline]
1401#[target_feature(enable = "avx")]
1402// This intrinsic has no corresponding instruction.
1403#[rustc_legacy_const_generics(2)]
1404#[stable(feature = "simd_x86", since = "1.27.0")]
1405pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
1406    static_assert_uimm_bits!(INDEX, 5);
1407    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
1408}
1409
1410/// Copies `a` to result, and inserts the 16-bit integer `i` into result
1411/// at the location specified by `INDEX`.
1412///
1413/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
1414#[inline]
1415#[target_feature(enable = "avx")]
1416// This intrinsic has no corresponding instruction.
1417#[rustc_legacy_const_generics(2)]
1418#[stable(feature = "simd_x86", since = "1.27.0")]
1419pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
1420    static_assert_uimm_bits!(INDEX, 4);
1421    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
1422}
1423
1424/// Copies `a` to result, and inserts the 32-bit integer `i` into result
1425/// at the location specified by `INDEX`.
1426///
1427/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
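///
/// # Examples
///
/// A minimal usage sketch (assumes an x86_64 target and runtime AVX
/// detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///             // Overwrite lane 7 (the highest lane) with -1.
///             let r = _mm256_insert_epi32::<7>(a, -1);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
///             assert_eq!(out, [0, 1, 2, 3, 4, 5, 6, -1]);
///         }
///     }
/// }
/// ```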
1428#[inline]
1429#[target_feature(enable = "avx")]
1430// This intrinsic has no corresponding instruction.
1431#[rustc_legacy_const_generics(2)]
1432#[stable(feature = "simd_x86", since = "1.27.0")]
1433pub fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
1434    static_assert_uimm_bits!(INDEX, 3);
1435    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
1436}
1437
1438/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1439/// floating-point elements) from memory into result.
1440/// `mem_addr` must be aligned on a 32-byte boundary or a
1441/// general-protection exception may be generated.
1442///
1443/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
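///
/// # Examples
///
/// A minimal usage sketch (assumes an x86_64 target and runtime AVX
/// detection); the `Aligned` wrapper exists only to guarantee the required
/// 32-byte alignment:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         #[repr(align(32))]
///         struct Aligned([f64; 4]);
///
///         let buf = Aligned([1.0, 2.0, 3.0, 4.0]);
///         // The pointer is 32-byte aligned, so the aligned load is allowed.
///         let v = unsafe { _mm256_load_pd(buf.0.as_ptr()) };
///         let mut out = [0.0_f64; 4];
///         unsafe { _mm256_storeu_pd(out.as_mut_ptr(), v) };
///         assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
///     }
/// }
/// ```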
1444#[inline]
1445#[target_feature(enable = "avx")]
1446#[cfg_attr(test, assert_instr(vmovap))]
1447#[stable(feature = "simd_x86", since = "1.27.0")]
1448#[allow(clippy::cast_ptr_alignment)]
1449pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1450    *(mem_addr as *const __m256d)
1451}
1452
1453/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1454/// floating-point elements) from `a` into memory.
1455/// `mem_addr` must be aligned on a 32-byte boundary or a
1456/// general-protection exception may be generated.
1457///
1458/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
1459#[inline]
1460#[target_feature(enable = "avx")]
1461#[cfg_attr(test, assert_instr(vmovap))]
1462#[stable(feature = "simd_x86", since = "1.27.0")]
1463#[allow(clippy::cast_ptr_alignment)]
1464pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
1465    *(mem_addr as *mut __m256d) = a;
1466}
1467
1468/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1469/// floating-point elements) from memory into result.
1470/// `mem_addr` must be aligned on a 32-byte boundary or a
1471/// general-protection exception may be generated.
1472///
1473/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
1474#[inline]
1475#[target_feature(enable = "avx")]
1476#[cfg_attr(test, assert_instr(vmovaps))]
1477#[stable(feature = "simd_x86", since = "1.27.0")]
1478#[allow(clippy::cast_ptr_alignment)]
1479pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1480    *(mem_addr as *const __m256)
1481}
1482
1483/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1484/// floating-point elements) from `a` into memory.
1485/// `mem_addr` must be aligned on a 32-byte boundary or a
1486/// general-protection exception may be generated.
1487///
1488/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
1489#[inline]
1490#[target_feature(enable = "avx")]
1491#[cfg_attr(test, assert_instr(vmovaps))]
1492#[stable(feature = "simd_x86", since = "1.27.0")]
1493#[allow(clippy::cast_ptr_alignment)]
1494pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
1495    *(mem_addr as *mut __m256) = a;
1496}
1497
1498/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1499/// floating-point elements) from memory into result.
1500/// `mem_addr` does not need to be aligned on any particular boundary.
1501///
1502/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
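///
/// # Examples
///
/// A minimal usage sketch (assumes an x86_64 target and runtime AVX
/// detection); no particular alignment of the source data is required:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let data = [1.0_f64, 2.0, 3.0, 4.0];
///         let v = unsafe { _mm256_loadu_pd(data.as_ptr()) };
///         let doubled = unsafe { _mm256_add_pd(v, v) };
///         let mut out = [0.0_f64; 4];
///         unsafe { _mm256_storeu_pd(out.as_mut_ptr(), doubled) };
///         assert_eq!(out, [2.0, 4.0, 6.0, 8.0]);
///     }
/// }
/// ```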
1503#[inline]
1504#[target_feature(enable = "avx")]
1505#[cfg_attr(test, assert_instr(vmovup))]
1506#[stable(feature = "simd_x86", since = "1.27.0")]
1507pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1508    let mut dst = _mm256_undefined_pd();
1509    ptr::copy_nonoverlapping(
1510        mem_addr as *const u8,
1511        ptr::addr_of_mut!(dst) as *mut u8,
1512        mem::size_of::<__m256d>(),
1513    );
1514    dst
1515}
1516
1517/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1518/// floating-point elements) from `a` into memory.
1519/// `mem_addr` does not need to be aligned on any particular boundary.
1520///
1521/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
1522#[inline]
1523#[target_feature(enable = "avx")]
1524#[cfg_attr(test, assert_instr(vmovup))]
1525#[stable(feature = "simd_x86", since = "1.27.0")]
1526pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
1527    mem_addr.cast::<__m256d>().write_unaligned(a);
1528}
1529
1530/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1531/// floating-point elements) from memory into result.
1532/// `mem_addr` does not need to be aligned on any particular boundary.
1533///
1534/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
1535#[inline]
1536#[target_feature(enable = "avx")]
1537#[cfg_attr(test, assert_instr(vmovups))]
1538#[stable(feature = "simd_x86", since = "1.27.0")]
1539pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1540    let mut dst = _mm256_undefined_ps();
1541    ptr::copy_nonoverlapping(
1542        mem_addr as *const u8,
1543        ptr::addr_of_mut!(dst) as *mut u8,
1544        mem::size_of::<__m256>(),
1545    );
1546    dst
1547}
1548
1549/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1550/// floating-point elements) from `a` into memory.
1551/// `mem_addr` does not need to be aligned on any particular boundary.
1552///
1553/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
1554#[inline]
1555#[target_feature(enable = "avx")]
1556#[cfg_attr(test, assert_instr(vmovups))]
1557#[stable(feature = "simd_x86", since = "1.27.0")]
1558pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
1559    mem_addr.cast::<__m256>().write_unaligned(a);
1560}
1561
1562/// Loads 256-bits of integer data from memory into result.
1563/// `mem_addr` must be aligned on a 32-byte boundary or a
1564/// general-protection exception may be generated.
1565///
1566/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
1567#[inline]
1568#[target_feature(enable = "avx")]
1569#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
1570#[stable(feature = "simd_x86", since = "1.27.0")]
1571pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
1572    *mem_addr
1573}
1574
1575/// Stores 256-bits of integer data from `a` into memory.
1576/// `mem_addr` must be aligned on a 32-byte boundary or a
1577/// general-protection exception may be generated.
1578///
1579/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
1580#[inline]
1581#[target_feature(enable = "avx")]
1582#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
1583#[stable(feature = "simd_x86", since = "1.27.0")]
1584pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
1585    *mem_addr = a;
1586}
1587
1588/// Loads 256-bits of integer data from memory into result.
1589/// `mem_addr` does not need to be aligned on any particular boundary.
1590///
1591/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
1592#[inline]
1593#[target_feature(enable = "avx")]
1594#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1595#[stable(feature = "simd_x86", since = "1.27.0")]
1596pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1597    let mut dst = _mm256_undefined_si256();
1598    ptr::copy_nonoverlapping(
1599        mem_addr as *const u8,
1600        ptr::addr_of_mut!(dst) as *mut u8,
1601        mem::size_of::<__m256i>(),
1602    );
1603    dst
1604}
1605
1606/// Stores 256-bits of integer data from `a` into memory.
1607/// `mem_addr` does not need to be aligned on any particular boundary.
1608///
1609/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
1610#[inline]
1611#[target_feature(enable = "avx")]
1612#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1613#[stable(feature = "simd_x86", since = "1.27.0")]
1614pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
1615    mem_addr.write_unaligned(a);
1616}
1617
1618/// Loads packed double-precision (64-bit) floating-point elements from memory
1619/// into result using `mask` (elements are zeroed out when the high bit of the
1620/// corresponding mask element is not set).
1621///
1622/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
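///
/// # Examples
///
/// A minimal usage sketch (assumes an x86_64 target and runtime AVX
/// detection); each mask element selects a lane through its high (sign) bit:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let data = [1.0_f64, 2.0, 3.0, 4.0];
///         unsafe {
///             // -1 has its high bit set (load the lane), 0 does not (zero the lane).
///             let mask = _mm256_setr_epi64x(-1, 0, -1, 0);
///             let v = _mm256_maskload_pd(data.as_ptr(), mask);
///             let mut out = [0.0_f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             assert_eq!(out, [1.0, 0.0, 3.0, 0.0]);
///         }
///     }
/// }
/// ```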
1623#[inline]
1624#[target_feature(enable = "avx")]
1625#[cfg_attr(test, assert_instr(vmaskmovpd))]
1626#[stable(feature = "simd_x86", since = "1.27.0")]
1627pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
1628    maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
1629}
1630
1631/// Stores packed double-precision (64-bit) floating-point elements from `a`
1632/// into memory using `mask`.
1633///
1634/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
1635#[inline]
1636#[target_feature(enable = "avx")]
1637#[cfg_attr(test, assert_instr(vmaskmovpd))]
1638#[stable(feature = "simd_x86", since = "1.27.0")]
1639pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
1640    maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
1641}
1642
1643/// Loads packed double-precision (64-bit) floating-point elements from memory
1644/// into result using `mask` (elements are zeroed out when the high bit of the
1645/// corresponding mask element is not set).
1646///
1647/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
1648#[inline]
1649#[target_feature(enable = "avx")]
1650#[cfg_attr(test, assert_instr(vmaskmovpd))]
1651#[stable(feature = "simd_x86", since = "1.27.0")]
1652pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1653    maskloadpd(mem_addr as *const i8, mask.as_i64x2())
1654}
1655
1656/// Stores packed double-precision (64-bit) floating-point elements from `a`
1657/// into memory using `mask`.
1658///
1659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
1660#[inline]
1661#[target_feature(enable = "avx")]
1662#[cfg_attr(test, assert_instr(vmaskmovpd))]
1663#[stable(feature = "simd_x86", since = "1.27.0")]
1664pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1665    maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a);
1666}
1667
1668/// Loads packed single-precision (32-bit) floating-point elements from memory
1669/// into result using `mask` (elements are zeroed out when the high bit of the
1670/// corresponding mask element is not set).
1671///
1672/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
1673#[inline]
1674#[target_feature(enable = "avx")]
1675#[cfg_attr(test, assert_instr(vmaskmovps))]
1676#[stable(feature = "simd_x86", since = "1.27.0")]
1677pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
1678    maskloadps256(mem_addr as *const i8, mask.as_i32x8())
1679}
1680
1681/// Stores packed single-precision (32-bit) floating-point elements from `a`
1682/// into memory using `mask`.
1683///
1684/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
1685#[inline]
1686#[target_feature(enable = "avx")]
1687#[cfg_attr(test, assert_instr(vmaskmovps))]
1688#[stable(feature = "simd_x86", since = "1.27.0")]
1689pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
1690    maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
1691}
1692
1693/// Loads packed single-precision (32-bit) floating-point elements from memory
1694/// into result using `mask` (elements are zeroed out when the high bit of the
1695/// corresponding mask element is not set).
1696///
1697/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
1698#[inline]
1699#[target_feature(enable = "avx")]
1700#[cfg_attr(test, assert_instr(vmaskmovps))]
1701#[stable(feature = "simd_x86", since = "1.27.0")]
1702pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1703    maskloadps(mem_addr as *const i8, mask.as_i32x4())
1704}
1705
1706/// Stores packed single-precision (32-bit) floating-point elements from `a`
1707/// into memory using `mask`.
1708///
1709/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
1710#[inline]
1711#[target_feature(enable = "avx")]
1712#[cfg_attr(test, assert_instr(vmaskmovps))]
1713#[stable(feature = "simd_x86", since = "1.27.0")]
1714pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1715    maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a);
1716}
1717
1718/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
1719/// from `a`, and returns the results.
1720///
1721/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
1722#[inline]
1723#[target_feature(enable = "avx")]
1724#[cfg_attr(test, assert_instr(vmovshdup))]
1725#[stable(feature = "simd_x86", since = "1.27.0")]
1726pub fn _mm256_movehdup_ps(a: __m256) -> __m256 {
1727    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
1728}
1729
1730/// Duplicates even-indexed single-precision (32-bit) floating-point elements
1731/// from `a`, and returns the results.
1732///
1733/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
1734#[inline]
1735#[target_feature(enable = "avx")]
1736#[cfg_attr(test, assert_instr(vmovsldup))]
1737#[stable(feature = "simd_x86", since = "1.27.0")]
1738pub fn _mm256_moveldup_ps(a: __m256) -> __m256 {
1739    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
1740}
1741
1742/// Duplicates even-indexed double-precision (64-bit) floating-point elements
1743/// from `a`, and returns the results.
1744///
1745/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
1746#[inline]
1747#[target_feature(enable = "avx")]
1748#[cfg_attr(test, assert_instr(vmovddup))]
1749#[stable(feature = "simd_x86", since = "1.27.0")]
1750pub fn _mm256_movedup_pd(a: __m256d) -> __m256d {
1751    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
1752}
1753
1754/// Loads 256-bits of integer data from unaligned memory into result.
1755/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1756/// data crosses a cache line boundary.
1757///
1758/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
1759#[inline]
1760#[target_feature(enable = "avx")]
1761#[cfg_attr(test, assert_instr(vlddqu))]
1762#[stable(feature = "simd_x86", since = "1.27.0")]
1763pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
1764    transmute(vlddqu(mem_addr as *const i8))
1765}
1766
1767/// Moves integer data from a 256-bit integer vector to a 32-byte
1768/// aligned memory location. To minimize caching, the data is flagged as
1769/// non-temporal (unlikely to be used again soon).
1770///
1771/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
1772///
1773/// # Safety of non-temporal stores
1774///
1775/// After using this intrinsic, but before any other access to the memory that this intrinsic
1776/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1777/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1778/// return.
1779///
1780/// See [`_mm_sfence`] for details.
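///
/// # Examples
///
/// A minimal usage sketch (assumes an x86_64 target and runtime AVX
/// detection); the destination is 32-byte aligned and the store is followed
/// by `_mm_sfence`, as required above:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         #[repr(align(32))]
///         struct Aligned([i32; 8]);
///
///         let mut buf = Aligned([0; 8]);
///         unsafe {
///             let v = _mm256_set1_epi32(42);
///             _mm256_stream_si256(buf.0.as_mut_ptr() as *mut __m256i, v);
///             // Make the non-temporal store visible before any other access.
///             _mm_sfence();
///         }
///         assert_eq!(buf.0, [42; 8]);
///     }
/// }
/// ```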
1781#[inline]
1782#[target_feature(enable = "avx")]
1783#[cfg_attr(test, assert_instr(vmovntdq))]
1784#[stable(feature = "simd_x86", since = "1.27.0")]
1785pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1786    crate::arch::asm!(
1787        vps!("vmovntdq", ",{a}"),
1788        p = in(reg) mem_addr,
1789        a = in(ymm_reg) a,
1790        options(nostack, preserves_flags),
1791    );
1792}
1793
1794/// Moves double-precision values from a 256-bit vector of `[4 x double]`
1795/// to a 32-byte aligned memory location. To minimize caching, the data is
1796/// flagged as non-temporal (unlikely to be used again soon).
1797///
1798/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
1799///
1800/// # Safety of non-temporal stores
1801///
1802/// After using this intrinsic, but before any other access to the memory that this intrinsic
1803/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1804/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1805/// return.
1806///
1807/// See [`_mm_sfence`] for details.
1808#[inline]
1809#[target_feature(enable = "avx")]
1810#[cfg_attr(test, assert_instr(vmovntpd))]
1811#[stable(feature = "simd_x86", since = "1.27.0")]
1812#[allow(clippy::cast_ptr_alignment)]
1813pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1814    crate::arch::asm!(
1815        vps!("vmovntpd", ",{a}"),
1816        p = in(reg) mem_addr,
1817        a = in(ymm_reg) a,
1818        options(nostack, preserves_flags),
1819    );
1820}
1821
1822/// Moves single-precision floating point values from a 256-bit vector
1823/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
1824/// caching, the data is flagged as non-temporal (unlikely to be used again
1825/// soon).
1826///
1827/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
1828///
1829/// # Safety of non-temporal stores
1830///
1831/// After using this intrinsic, but before any other access to the memory that this intrinsic
1832/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1833/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1834/// return.
1835///
1836/// See [`_mm_sfence`] for details.
1837#[inline]
1838#[target_feature(enable = "avx")]
1839#[cfg_attr(test, assert_instr(vmovntps))]
1840#[stable(feature = "simd_x86", since = "1.27.0")]
1841#[allow(clippy::cast_ptr_alignment)]
1842pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1843    crate::arch::asm!(
1844        vps!("vmovntps", ",{a}"),
1845        p = in(reg) mem_addr,
1846        a = in(ymm_reg) a,
1847        options(nostack, preserves_flags),
1848    );
1849}
1850
1851/// Computes the approximate reciprocal of packed single-precision (32-bit)
1852/// floating-point elements in `a`, and returns the results. The maximum
1853/// relative error for this approximation is less than 1.5*2^-12.
1854///
1855/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
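///
/// # Examples
///
/// A minimal usage sketch (assumes an x86_64 target and runtime AVX
/// detection); the result is an approximation, so it is checked against a
/// tolerance rather than for exact equality:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_set1_ps(4.0);
///             let r = _mm256_rcp_ps(a);
///             let mut out = [0.0_f32; 8];
///             _mm256_storeu_ps(out.as_mut_ptr(), r);
///             // Each lane approximates 1.0 / 4.0 = 0.25 within the stated error bound.
///             assert!(out.iter().all(|&x| (x - 0.25).abs() < 1e-3));
///         }
///     }
/// }
/// ```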
1856#[inline]
1857#[target_feature(enable = "avx")]
1858#[cfg_attr(test, assert_instr(vrcpps))]
1859#[stable(feature = "simd_x86", since = "1.27.0")]
1860pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
1861    unsafe { vrcpps(a) }
1862}
1863
1864/// Computes the approximate reciprocal square root of packed single-precision
1865/// (32-bit) floating-point elements in `a`, and returns the results.
1866/// The maximum relative error for this approximation is less than 1.5*2^-12.
1867///
1868/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
1869#[inline]
1870#[target_feature(enable = "avx")]
1871#[cfg_attr(test, assert_instr(vrsqrtps))]
1872#[stable(feature = "simd_x86", since = "1.27.0")]
1873pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
1874    unsafe { vrsqrtps(a) }
1875}
1876
1877/// Unpacks and interleaves double-precision (64-bit) floating-point elements
1878/// from the high half of each 128-bit lane in `a` and `b`.
1879///
1880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
1881#[inline]
1882#[target_feature(enable = "avx")]
1883#[cfg_attr(test, assert_instr(vunpckhpd))]
1884#[stable(feature = "simd_x86", since = "1.27.0")]
1885pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
1886    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
1887}
1888
1889/// Unpacks and interleaves single-precision (32-bit) floating-point elements
1890/// from the high half of each 128-bit lane in `a` and `b`.
1891///
1892/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
1893#[inline]
1894#[target_feature(enable = "avx")]
1895#[cfg_attr(test, assert_instr(vunpckhps))]
1896#[stable(feature = "simd_x86", since = "1.27.0")]
1897pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
1898    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
1899}
1900
1901/// Unpacks and interleaves double-precision (64-bit) floating-point elements
1902/// from the low half of each 128-bit lane in `a` and `b`.
1903///
1904/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
1905#[inline]
1906#[target_feature(enable = "avx")]
1907#[cfg_attr(test, assert_instr(vunpcklpd))]
1908#[stable(feature = "simd_x86", since = "1.27.0")]
1909pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
1910    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
1911}
1912
1913/// Unpacks and interleaves single-precision (32-bit) floating-point elements
1914/// from the low half of each 128-bit lane in `a` and `b`.
1915///
1916/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
1917#[inline]
1918#[target_feature(enable = "avx")]
1919#[cfg_attr(test, assert_instr(vunpcklps))]
1920#[stable(feature = "simd_x86", since = "1.27.0")]
1921pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
1922    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
1923}
1924
1925/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1926/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
1927/// Computes the bitwise NOT of `a` and then ANDs it with `b`, and sets `CF` to 1
1928/// if the result is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
1929///
1930/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
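///
/// # Examples
///
/// A minimal usage sketch (assumes an x86_64 target and runtime AVX
/// detection):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_set1_epi32(0b0101);
///             let b = _mm256_set1_epi32(0b1010);
///             // `a & b` is all zeros, so ZF is set and 1 is returned.
///             assert_eq!(_mm256_testz_si256(a, b), 1);
///             // `a & a` is non-zero, so ZF is clear and 0 is returned.
///             assert_eq!(_mm256_testz_si256(a, a), 0);
///         }
///     }
/// }
/// ```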
1931#[inline]
1932#[target_feature(enable = "avx")]
1933#[cfg_attr(test, assert_instr(vptest))]
1934#[stable(feature = "simd_x86", since = "1.27.0")]
1935pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
1936    unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) }
1937}
1938
1939/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1940/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
1941/// Computes the bitwise NOT of `a` and then ANDs it with `b`, and sets `CF` to 1
1942/// if the result is zero, otherwise sets `CF` to 0. Returns the `CF` value.
1943///
1944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
1945#[inline]
1946#[target_feature(enable = "avx")]
1947#[cfg_attr(test, assert_instr(vptest))]
1948#[stable(feature = "simd_x86", since = "1.27.0")]
1949pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
1950    unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) }
1951}
1952
1953/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1954/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
1955/// Computes the bitwise NOT of `a` and then ANDs it with `b`, and sets `CF` to 1
1956/// if the result is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF`
1957/// and `CF` values are zero, otherwise returns 0.
1958///
1959/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
1960#[inline]
1961#[target_feature(enable = "avx")]
1962#[cfg_attr(test, assert_instr(vptest))]
1963#[stable(feature = "simd_x86", since = "1.27.0")]
1964pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
1965    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
1966}
1967
1968/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
1969/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1970/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1971/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1972/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
1973/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1974/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
1975///
1976/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
1977#[inline]
1978#[target_feature(enable = "avx")]
1979#[cfg_attr(test, assert_instr(vtestpd))]
1980#[stable(feature = "simd_x86", since = "1.27.0")]
1981pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
1982    unsafe { vtestzpd256(a, b) }
1983}
1984
1985/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
1986/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1987/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1988/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1989/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
1990/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1991/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
1992///
1993/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
1994#[inline]
1995#[target_feature(enable = "avx")]
1996#[cfg_attr(test, assert_instr(vtestpd))]
1997#[stable(feature = "simd_x86", since = "1.27.0")]
1998pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
1999    unsafe { vtestcpd256(a, b) }
2000}
2001
2002/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2003/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2004/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2005/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2006/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2007/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2008/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2009/// are zero, otherwise returns 0.
2010///
2011/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
2012#[inline]
2013#[target_feature(enable = "avx")]
2014#[cfg_attr(test, assert_instr(vtestpd))]
2015#[stable(feature = "simd_x86", since = "1.27.0")]
2016pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
2017    unsafe { vtestnzcpd256(a, b) }
2018}
2019
2020/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2021/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2022/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2023/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2024/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2025/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2026/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2027///
2028/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
2029#[inline]
2030#[target_feature(enable = "avx")]
2031#[cfg_attr(test, assert_instr(vtestpd))]
2032#[stable(feature = "simd_x86", since = "1.27.0")]
2033pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
2034    unsafe { vtestzpd(a, b) }
2035}
2036
2037/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2038/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2039/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2040/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2041/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2042/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2043/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2044///
2045/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
2046#[inline]
2047#[target_feature(enable = "avx")]
2048#[cfg_attr(test, assert_instr(vtestpd))]
2049#[stable(feature = "simd_x86", since = "1.27.0")]
2050pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
2051    unsafe { vtestcpd(a, b) }
2052}
2053
2054/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2055/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2056/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2057/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2058/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2059/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2060/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2061/// are zero, otherwise returns 0.
2062///
2063/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
2064#[inline]
2065#[target_feature(enable = "avx")]
2066#[cfg_attr(test, assert_instr(vtestpd))]
2067#[stable(feature = "simd_x86", since = "1.27.0")]
2068pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
2069    unsafe { vtestnzcpd(a, b) }
2070}
2071
2072/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2073/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2074/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2075/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2076/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2077/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2078/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2079///
2080/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
2081#[inline]
2082#[target_feature(enable = "avx")]
2083#[cfg_attr(test, assert_instr(vtestps))]
2084#[stable(feature = "simd_x86", since = "1.27.0")]
2085pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
2086    unsafe { vtestzps256(a, b) }
2087}
2088
2089/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2090/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2091/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2092/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2093/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2094/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2095/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2096///
2097/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
2098#[inline]
2099#[target_feature(enable = "avx")]
2100#[cfg_attr(test, assert_instr(vtestps))]
2101#[stable(feature = "simd_x86", since = "1.27.0")]
2102pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
2103    unsafe { vtestcps256(a, b) }
2104}
2105
2106/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2107/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2108/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2109/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2110/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2111/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2112/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2113/// are zero, otherwise returns 0.
2114///
2115/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
2116#[inline]
2117#[target_feature(enable = "avx")]
2118#[cfg_attr(test, assert_instr(vtestps))]
2119#[stable(feature = "simd_x86", since = "1.27.0")]
2120pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
2121    unsafe { vtestnzcps256(a, b) }
2122}
2123
2124/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2125/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2126/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2127/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2128/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2129/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2130/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2131///
2132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
2133#[inline]
2134#[target_feature(enable = "avx")]
2135#[cfg_attr(test, assert_instr(vtestps))]
2136#[stable(feature = "simd_x86", since = "1.27.0")]
2137pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2138    unsafe { vtestzps(a, b) }
2139}
2140
2141/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2142/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2143/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2144/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2145/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2146/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2147/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2148///
2149/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
2150#[inline]
2151#[target_feature(enable = "avx")]
2152#[cfg_attr(test, assert_instr(vtestps))]
2153#[stable(feature = "simd_x86", since = "1.27.0")]
2154pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2155    unsafe { vtestcps(a, b) }
2156}
2157
2158/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2159/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2160/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2161/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2162/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2163/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2164/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2165/// are zero, otherwise returns 0.
2166///
2167/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
2168#[inline]
2169#[target_feature(enable = "avx")]
2170#[cfg_attr(test, assert_instr(vtestps))]
2171#[stable(feature = "simd_x86", since = "1.27.0")]
2172pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
2173    unsafe { vtestnzcps(a, b) }
2174}
2175
2176/// Sets each bit of the returned mask based on the most significant bit of the
2177/// corresponding packed double-precision (64-bit) floating-point element in
2178/// `a`.
2179///
2180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
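///
/// # Examples
///
/// A minimal usage sketch (assumes an x86_64 target and runtime AVX
/// detection); bit `i` of the result is the sign bit of element `i`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setr_pd(-1.0, 2.0, -3.0, 4.0);
///             // Elements 0 and 2 are negative, so bits 0 and 2 are set.
///             assert_eq!(_mm256_movemask_pd(a), 0b0101);
///         }
///     }
/// }
/// ```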
2181#[inline]
2182#[target_feature(enable = "avx")]
2183#[cfg_attr(test, assert_instr(vmovmskpd))]
2184#[stable(feature = "simd_x86", since = "1.27.0")]
2185pub fn _mm256_movemask_pd(a: __m256d) -> i32 {
2186    // Propagate the highest bit to the rest, because simd_bitmask
2187    // requires all-1 or all-0.
2188    unsafe {
2189        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
2190        simd_bitmask::<i64x4, u8>(mask).into()
2191    }
2192}
2193
2194/// Sets each bit of the returned mask based on the most significant bit of the
2195/// corresponding packed single-precision (32-bit) floating-point element in
2196/// `a`.
2197///
2198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
2199#[inline]
2200#[target_feature(enable = "avx")]
2201#[cfg_attr(test, assert_instr(vmovmskps))]
2202#[stable(feature = "simd_x86", since = "1.27.0")]
2203pub fn _mm256_movemask_ps(a: __m256) -> i32 {
2204    // Propagate the highest bit to the rest, because simd_bitmask
2205    // requires all-1 or all-0.
2206    unsafe {
2207        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
2208        simd_bitmask::<i32x8, u8>(mask).into()
2209    }
2210}
2211
2212/// Returns vector of type __m256d with all elements set to zero.
2213///
2214/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
2215#[inline]
2216#[target_feature(enable = "avx")]
2217#[cfg_attr(test, assert_instr(vxorp))]
2218#[stable(feature = "simd_x86", since = "1.27.0")]
2219pub fn _mm256_setzero_pd() -> __m256d {
2220    const { unsafe { mem::zeroed() } }
2221}
2222
2223/// Returns vector of type __m256 with all elements set to zero.
2224///
2225/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
2226#[inline]
2227#[target_feature(enable = "avx")]
2228#[cfg_attr(test, assert_instr(vxorps))]
2229#[stable(feature = "simd_x86", since = "1.27.0")]
2230pub fn _mm256_setzero_ps() -> __m256 {
2231    const { unsafe { mem::zeroed() } }
2232}
2233
2234/// Returns vector of type __m256i with all elements set to zero.
2235///
2236/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
2237#[inline]
2238#[target_feature(enable = "avx")]
2239#[cfg_attr(test, assert_instr(vxor))]
2240#[stable(feature = "simd_x86", since = "1.27.0")]
2241pub fn _mm256_setzero_si256() -> __m256i {
2242    const { unsafe { mem::zeroed() } }
2243}
2244
2245/// Sets packed double-precision (64-bit) floating-point elements in returned
2246/// vector with the supplied values.
2247///
2248/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
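///
/// # Examples
///
/// A minimal usage sketch (assumes an x86_64 target and runtime AVX
/// detection); note that the first argument ends up in the highest lane:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
///             let mut out = [0.0_f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             // In memory order the elements appear reversed relative to the
///             // argument order; `_mm256_setr_pd` takes them in memory order.
///             assert_eq!(out, [0.0, 1.0, 2.0, 3.0]);
///         }
///     }
/// }
/// ```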
2249#[inline]
2250#[target_feature(enable = "avx")]
2251// This intrinsic has no corresponding instruction.
2252#[cfg_attr(test, assert_instr(vinsertf128))]
2253#[stable(feature = "simd_x86", since = "1.27.0")]
2254pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2255    _mm256_setr_pd(d, c, b, a)
2256}
2257
2258/// Sets packed single-precision (32-bit) floating-point elements in returned
2259/// vector with the supplied values.
2260///
2261/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2262#[inline]
2263#[target_feature(enable = "avx")]
2264// This intrinsic has no corresponding instruction.
2265#[stable(feature = "simd_x86", since = "1.27.0")]
2266pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
2267    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2268}
2269
2270/// Sets packed 8-bit integers in returned vector with the supplied values.
2271///
2272/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
2273#[inline]
2274#[target_feature(enable = "avx")]
2275// This intrinsic has no corresponding instruction.
2276#[stable(feature = "simd_x86", since = "1.27.0")]
2277pub fn _mm256_set_epi8(
2278    e00: i8,
2279    e01: i8,
2280    e02: i8,
2281    e03: i8,
2282    e04: i8,
2283    e05: i8,
2284    e06: i8,
2285    e07: i8,
2286    e08: i8,
2287    e09: i8,
2288    e10: i8,
2289    e11: i8,
2290    e12: i8,
2291    e13: i8,
2292    e14: i8,
2293    e15: i8,
2294    e16: i8,
2295    e17: i8,
2296    e18: i8,
2297    e19: i8,
2298    e20: i8,
2299    e21: i8,
2300    e22: i8,
2301    e23: i8,
2302    e24: i8,
2303    e25: i8,
2304    e26: i8,
2305    e27: i8,
2306    e28: i8,
2307    e29: i8,
2308    e30: i8,
2309    e31: i8,
2310) -> __m256i {
2311    #[rustfmt::skip]
2312    _mm256_setr_epi8(
2313        e31, e30, e29, e28, e27, e26, e25, e24,
2314        e23, e22, e21, e20, e19, e18, e17, e16,
2315        e15, e14, e13, e12, e11, e10, e09, e08,
2316        e07, e06, e05, e04, e03, e02, e01, e00,
2317    )
2318}
2319
2320/// Sets packed 16-bit integers in returned vector with the supplied values.
2321///
2322/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
2323#[inline]
2324#[target_feature(enable = "avx")]
2325// This intrinsic has no corresponding instruction.
2326#[stable(feature = "simd_x86", since = "1.27.0")]
2327pub fn _mm256_set_epi16(
2328    e00: i16,
2329    e01: i16,
2330    e02: i16,
2331    e03: i16,
2332    e04: i16,
2333    e05: i16,
2334    e06: i16,
2335    e07: i16,
2336    e08: i16,
2337    e09: i16,
2338    e10: i16,
2339    e11: i16,
2340    e12: i16,
2341    e13: i16,
2342    e14: i16,
2343    e15: i16,
2344) -> __m256i {
2345    #[rustfmt::skip]
2346    _mm256_setr_epi16(
2347        e15, e14, e13, e12,
2348        e11, e10, e09, e08,
2349        e07, e06, e05, e04,
2350        e03, e02, e01, e00,
2351    )
2352}
2353
2354/// Sets packed 32-bit integers in returned vector with the supplied values.
2355///
2356/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2357#[inline]
2358#[target_feature(enable = "avx")]
2359// This intrinsic has no corresponding instruction.
2360#[stable(feature = "simd_x86", since = "1.27.0")]
2361pub fn _mm256_set_epi32(
2362    e0: i32,
2363    e1: i32,
2364    e2: i32,
2365    e3: i32,
2366    e4: i32,
2367    e5: i32,
2368    e6: i32,
2369    e7: i32,
2370) -> __m256i {
2371    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2372}
2373
2374/// Sets packed 64-bit integers in returned vector with the supplied values.
2375///
2376/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2377#[inline]
2378#[target_feature(enable = "avx")]
2379// This intrinsic has no corresponding instruction.
2380#[stable(feature = "simd_x86", since = "1.27.0")]
2381pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2382    _mm256_setr_epi64x(d, c, b, a)
2383}
2384
2385/// Sets packed double-precision (64-bit) floating-point elements in returned
2386/// vector with the supplied values in reverse order.
2387///
2388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
2389#[inline]
2390#[target_feature(enable = "avx")]
2391// This intrinsic has no corresponding instruction.
2392#[stable(feature = "simd_x86", since = "1.27.0")]
2393pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2394    __m256d([a, b, c, d])
2395}
2396
2397/// Sets packed single-precision (32-bit) floating-point elements in returned
2398/// vector with the supplied values in reverse order.
2399///
2400/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
2401#[inline]
2402#[target_feature(enable = "avx")]
2403// This intrinsic has no corresponding instruction.
2404#[stable(feature = "simd_x86", since = "1.27.0")]
2405pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
2406    __m256([a, b, c, d, e, f, g, h])
2407}
2408
2409/// Sets packed 8-bit integers in returned vector with the supplied values in
2410/// reverse order.
2411///
2412/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
2413#[inline]
2414#[target_feature(enable = "avx")]
2415// This intrinsic has no corresponding instruction.
2416#[stable(feature = "simd_x86", since = "1.27.0")]
2417pub fn _mm256_setr_epi8(
2418    e00: i8,
2419    e01: i8,
2420    e02: i8,
2421    e03: i8,
2422    e04: i8,
2423    e05: i8,
2424    e06: i8,
2425    e07: i8,
2426    e08: i8,
2427    e09: i8,
2428    e10: i8,
2429    e11: i8,
2430    e12: i8,
2431    e13: i8,
2432    e14: i8,
2433    e15: i8,
2434    e16: i8,
2435    e17: i8,
2436    e18: i8,
2437    e19: i8,
2438    e20: i8,
2439    e21: i8,
2440    e22: i8,
2441    e23: i8,
2442    e24: i8,
2443    e25: i8,
2444    e26: i8,
2445    e27: i8,
2446    e28: i8,
2447    e29: i8,
2448    e30: i8,
2449    e31: i8,
2450) -> __m256i {
2451    unsafe {
2452        #[rustfmt::skip]
2453        transmute(i8x32::new(
2454            e00, e01, e02, e03, e04, e05, e06, e07,
2455            e08, e09, e10, e11, e12, e13, e14, e15,
2456            e16, e17, e18, e19, e20, e21, e22, e23,
2457            e24, e25, e26, e27, e28, e29, e30, e31,
2458        ))
2459    }
2460}
2461
2462/// Sets packed 16-bit integers in returned vector with the supplied values in
2463/// reverse order.
2464///
2465/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2466#[inline]
2467#[target_feature(enable = "avx")]
2468// This intrinsic has no corresponding instruction.
2469#[stable(feature = "simd_x86", since = "1.27.0")]
2470pub fn _mm256_setr_epi16(
2471    e00: i16,
2472    e01: i16,
2473    e02: i16,
2474    e03: i16,
2475    e04: i16,
2476    e05: i16,
2477    e06: i16,
2478    e07: i16,
2479    e08: i16,
2480    e09: i16,
2481    e10: i16,
2482    e11: i16,
2483    e12: i16,
2484    e13: i16,
2485    e14: i16,
2486    e15: i16,
2487) -> __m256i {
2488    unsafe {
2489        #[rustfmt::skip]
2490        transmute(i16x16::new(
2491            e00, e01, e02, e03,
2492            e04, e05, e06, e07,
2493            e08, e09, e10, e11,
2494            e12, e13, e14, e15,
2495        ))
2496    }
2497}
2498
2499/// Sets packed 32-bit integers in returned vector with the supplied values in
2500/// reverse order.
2501///
2502/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2503#[inline]
2504#[target_feature(enable = "avx")]
2505// This intrinsic has no corresponding instruction.
2506#[stable(feature = "simd_x86", since = "1.27.0")]
2507pub fn _mm256_setr_epi32(
2508    e0: i32,
2509    e1: i32,
2510    e2: i32,
2511    e3: i32,
2512    e4: i32,
2513    e5: i32,
2514    e6: i32,
2515    e7: i32,
2516) -> __m256i {
2517    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
2518}
2519
2520/// Sets packed 64-bit integers in returned vector with the supplied values in
2521/// reverse order.
2522///
2523/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2524#[inline]
2525#[target_feature(enable = "avx")]
2526// This intrinsic has no corresponding instruction.
2527#[stable(feature = "simd_x86", since = "1.27.0")]
2528pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2529    unsafe { transmute(i64x4::new(a, b, c, d)) }
2530}
2531
2532/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2533/// elements of the returned vector.
2534///
2535/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2536#[inline]
2537#[target_feature(enable = "avx")]
2538// This intrinsic has no corresponding instruction.
2539#[stable(feature = "simd_x86", since = "1.27.0")]
2540pub fn _mm256_set1_pd(a: f64) -> __m256d {
2541    _mm256_setr_pd(a, a, a, a)
2542}
2543
2544/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2545/// elements of the returned vector.
2546///
2547/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2548#[inline]
2549#[target_feature(enable = "avx")]
2550// This intrinsic has no corresponding instruction.
2551#[stable(feature = "simd_x86", since = "1.27.0")]
2552pub fn _mm256_set1_ps(a: f32) -> __m256 {
2553    _mm256_setr_ps(a, a, a, a, a, a, a, a)
2554}
2555
2556/// Broadcasts 8-bit integer `a` to all elements of the returned vector.
2557/// This intrinsic may generate the `vpbroadcastb` instruction.
2558///
2559/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2560#[inline]
2561#[target_feature(enable = "avx")]
2562// This intrinsic has no corresponding instruction.
2563#[stable(feature = "simd_x86", since = "1.27.0")]
2564pub fn _mm256_set1_epi8(a: i8) -> __m256i {
2565    #[rustfmt::skip]
2566    _mm256_setr_epi8(
2567        a, a, a, a, a, a, a, a,
2568        a, a, a, a, a, a, a, a,
2569        a, a, a, a, a, a, a, a,
2570        a, a, a, a, a, a, a, a,
2571    )
2572}
2573
2574/// Broadcasts 16-bit integer `a` to all elements of the returned vector.
2575/// This intrinsic may generate the `vpbroadcastw` instruction.
2576///
2577/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2578#[inline]
2579#[target_feature(enable = "avx")]
2580//#[cfg_attr(test, assert_instr(vpshufb))]
2581#[cfg_attr(test, assert_instr(vinsertf128))]
2582// This intrinsic has no corresponding instruction.
2583#[stable(feature = "simd_x86", since = "1.27.0")]
2584pub fn _mm256_set1_epi16(a: i16) -> __m256i {
2585    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
2586}
2587
2588/// Broadcasts 32-bit integer `a` to all elements of the returned vector.
2589/// This intrinsic may generate the `vpbroadcastd` instruction.
2590///
2591/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
2592#[inline]
2593#[target_feature(enable = "avx")]
2594// This intrinsic has no corresponding instruction.
2595#[stable(feature = "simd_x86", since = "1.27.0")]
2596pub fn _mm256_set1_epi32(a: i32) -> __m256i {
2597    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
2598}
2599
2600/// Broadcasts 64-bit integer `a` to all elements of the returned vector.
2601/// This intrinsic may generate the `vpbroadcastq` instruction.
2602///
2603/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
2604#[inline]
2605#[target_feature(enable = "avx")]
2606#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2607#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
2608// This intrinsic has no corresponding instruction.
2609#[stable(feature = "simd_x86", since = "1.27.0")]
2610pub fn _mm256_set1_epi64x(a: i64) -> __m256i {
2611    _mm256_setr_epi64x(a, a, a, a)
2612}
2613
2614/// Casts vector of type __m256d to type __m256.
2615///
2616/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2617#[inline]
2618#[target_feature(enable = "avx")]
2619// This intrinsic is only used for compilation and does not generate any
2620// instructions, thus it has zero latency.
2621#[stable(feature = "simd_x86", since = "1.27.0")]
2622pub fn _mm256_castpd_ps(a: __m256d) -> __m256 {
2623    unsafe { transmute(a) }
2624}
2625
2626/// Casts vector of type __m256 to type __m256d.
2627///
2628/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2629#[inline]
2630#[target_feature(enable = "avx")]
2631// This intrinsic is only used for compilation and does not generate any
2632// instructions, thus it has zero latency.
2633#[stable(feature = "simd_x86", since = "1.27.0")]
2634pub fn _mm256_castps_pd(a: __m256) -> __m256d {
2635    unsafe { transmute(a) }
2636}
2637
2638/// Casts vector of type __m256 to type __m256i.
2639///
2640/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2641#[inline]
2642#[target_feature(enable = "avx")]
2643// This intrinsic is only used for compilation and does not generate any
2644// instructions, thus it has zero latency.
2645#[stable(feature = "simd_x86", since = "1.27.0")]
2646pub fn _mm256_castps_si256(a: __m256) -> __m256i {
2647    unsafe { transmute(a) }
2648}
2649
2650/// Casts vector of type __m256i to type __m256.
2651///
2652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2653#[inline]
2654#[target_feature(enable = "avx")]
2655// This intrinsic is only used for compilation and does not generate any
2656// instructions, thus it has zero latency.
2657#[stable(feature = "simd_x86", since = "1.27.0")]
2658pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
2659    unsafe { transmute(a) }
2660}
2661
2662/// Casts vector of type __m256d to type __m256i.
2663///
2664/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
2665#[inline]
2666#[target_feature(enable = "avx")]
2667// This intrinsic is only used for compilation and does not generate any
2668// instructions, thus it has zero latency.
2669#[stable(feature = "simd_x86", since = "1.27.0")]
2670pub fn _mm256_castpd_si256(a: __m256d) -> __m256i {
2671    unsafe { transmute(a) }
2672}
2673
2674/// Casts vector of type __m256i to type __m256d.
2675///
2676/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
2677#[inline]
2678#[target_feature(enable = "avx")]
2679// This intrinsic is only used for compilation and does not generate any
2680// instructions, thus it has zero latency.
2681#[stable(feature = "simd_x86", since = "1.27.0")]
2682pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
2683    unsafe { transmute(a) }
2684}
2685
2686/// Casts vector of type __m256 to type __m128.
2687///
2688/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
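///
/// # Examples
///
/// A minimal usage sketch on `x86_64`, assuming AVX is detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let v = unsafe { _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.) };
///         // The result keeps the lower four lanes of `v`.
///         let lo = unsafe { _mm256_castps256_ps128(v) };
///         assert_eq!(unsafe { _mm_cvtss_f32(lo) }, 1.0);
///     }
/// }
/// ```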
2689#[inline]
2690#[target_feature(enable = "avx")]
2691// This intrinsic is only used for compilation and does not generate any
2692// instructions, thus it has zero latency.
2693#[stable(feature = "simd_x86", since = "1.27.0")]
2694pub fn _mm256_castps256_ps128(a: __m256) -> __m128 {
2695    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
2696}
2697
2698/// Casts vector of type __m256d to type __m128d.
2699///
2700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
2701#[inline]
2702#[target_feature(enable = "avx")]
2703// This intrinsic is only used for compilation and does not generate any
2704// instructions, thus it has zero latency.
2705#[stable(feature = "simd_x86", since = "1.27.0")]
2706pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
2707    unsafe { simd_shuffle!(a, a, [0, 1]) }
2708}
2709
2710/// Casts vector of type __m256i to type __m128i.
2711///
2712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
2713#[inline]
2714#[target_feature(enable = "avx")]
2715// This intrinsic is only used for compilation and does not generate any
2716// instructions, thus it has zero latency.
2717#[stable(feature = "simd_x86", since = "1.27.0")]
2718pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
2719    unsafe {
2720        let a = a.as_i64x4();
2721        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
2722        transmute(dst)
2723    }
2724}
2725
2726/// Casts vector of type __m128 to type __m256;
2727/// the upper 128 bits of the result are undefined.
2728///
2729/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
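///
/// # Examples
///
/// A minimal sketch on `x86_64`; only the lower 128 bits of the result may be
/// relied upon (assumes AVX is detected at runtime):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let lo = unsafe { _mm_setr_ps(1., 2., 3., 4.) };
///         let v = unsafe { _mm256_castps128_ps256(lo) };
///         // The lower half is preserved; the upper half is indeterminate.
///         assert_eq!(unsafe { _mm256_cvtss_f32(v) }, 1.0);
///     }
/// }
/// ```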
2730#[inline]
2731#[target_feature(enable = "avx")]
2732// This intrinsic is only used for compilation and does not generate any
2733// instructions, thus it has zero latency.
2734#[stable(feature = "simd_x86", since = "1.27.0")]
2735pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
2736    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
2737}
2738
2739/// Casts vector of type __m128d to type __m256d;
2740/// the upper 128 bits of the result are undefined.
2741///
2742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
2743#[inline]
2744#[target_feature(enable = "avx")]
2745// This intrinsic is only used for compilation and does not generate any
2746// instructions, thus it has zero latency.
2747#[stable(feature = "simd_x86", since = "1.27.0")]
2748pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
2749    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
2750}
2751
2752/// Casts vector of type __m128i to type __m256i;
2753/// the upper 128 bits of the result are undefined.
2754///
2755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
2756#[inline]
2757#[target_feature(enable = "avx")]
2758// This intrinsic is only used for compilation and does not generate any
2759// instructions, thus it has zero latency.
2760#[stable(feature = "simd_x86", since = "1.27.0")]
2761pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
2762    unsafe {
2763        let a = a.as_i64x2();
2764        let undefined = i64x2::ZERO;
2765        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
2766        transmute(dst)
2767    }
2768}
2769
2770/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
2771/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
2772/// the value of the source vector. The upper 128 bits are set to zero.
2773///
2774/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
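///
/// # Examples
///
/// A minimal sketch on `x86_64` showing that the upper half is zeroed, in
/// contrast to `_mm256_castps128_ps256` (assumes AVX is detected at runtime):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let lo = unsafe { _mm_setr_ps(1., 2., 3., 4.) };
///         let v = unsafe { _mm256_zextps128_ps256(lo) };
///         let hi = unsafe { _mm256_extractf128_ps::<1>(v) };
///         assert_eq!(unsafe { _mm_cvtss_f32(hi) }, 0.0);
///     }
/// }
/// ```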
2775#[inline]
2776#[target_feature(enable = "avx")]
2777// This intrinsic is only used for compilation and does not generate any
2778// instructions, thus it has zero latency.
2779#[stable(feature = "simd_x86", since = "1.27.0")]
2780pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
2781    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
2782}
2783
2784/// Constructs a 256-bit integer vector from a 128-bit integer vector.
2785/// The lower 128 bits contain the value of the source vector. The upper
2786/// 128 bits are set to zero.
2787///
2788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
2789#[inline]
2790#[target_feature(enable = "avx")]
2791// This intrinsic is only used for compilation and does not generate any
2792// instructions, thus it has zero latency.
2793#[stable(feature = "simd_x86", since = "1.27.0")]
2794pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
2795    unsafe {
2796        let b = i64x2::ZERO;
2797        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
2798        transmute(dst)
2799    }
2800}
2801
2802/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
2803/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
2804/// contain the value of the source vector. The upper 128 bits are set
2805/// to zero.
2806///
2807/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
2808#[inline]
2809#[target_feature(enable = "avx")]
2810// This intrinsic is only used for compilation and does not generate any
2811// instructions, thus it has zero latency.
2812#[stable(feature = "simd_x86", since = "1.27.0")]
2813pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
2814    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
2815}
2816
2817/// Returns vector of type `__m256` with indeterminate elements.
2818/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2819/// In practice, this is equivalent to [`mem::zeroed`].
2820///
2821/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
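///
/// # Examples
///
/// A minimal sketch on `x86_64`; the result is typically used as a placeholder
/// that is fully overwritten before being read (assumes AVX at runtime):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let mut buf = unsafe { _mm256_undefined_ps() };
///         // Overwrite the placeholder before reading from it.
///         unsafe { _mm256_store_ps(&mut buf as *mut __m256 as *mut f32, _mm256_set1_ps(1.0)) };
///         assert_eq!(unsafe { _mm256_cvtss_f32(buf) }, 1.0);
///     }
/// }
/// ```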
2822#[inline]
2823#[target_feature(enable = "avx")]
2824// This intrinsic has no corresponding instruction.
2825#[stable(feature = "simd_x86", since = "1.27.0")]
2826pub fn _mm256_undefined_ps() -> __m256 {
2827    const { unsafe { mem::zeroed() } }
2828}
2829
2830/// Returns vector of type `__m256d` with indeterminate elements.
2831/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2832/// In practice, this is equivalent to [`mem::zeroed`].
2833///
2834/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
2835#[inline]
2836#[target_feature(enable = "avx")]
2837// This intrinsic has no corresponding instruction.
2838#[stable(feature = "simd_x86", since = "1.27.0")]
2839pub fn _mm256_undefined_pd() -> __m256d {
2840    const { unsafe { mem::zeroed() } }
2841}
2842
2843/// Returns vector of type `__m256i` with indeterminate elements.
2844/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2845/// In practice, this is equivalent to [`mem::zeroed`].
2846///
2847/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
2848#[inline]
2849#[target_feature(enable = "avx")]
2850// This intrinsic has no corresponding instruction.
2851#[stable(feature = "simd_x86", since = "1.27.0")]
2852pub fn _mm256_undefined_si256() -> __m256i {
2853    const { unsafe { mem::zeroed() } }
2854}
2855
2856/// Sets packed __m256 returned vector with the supplied values; `hi` forms the upper 128 bits and `lo` the lower 128 bits.
2857///
2858/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
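///
/// # Examples
///
/// A minimal usage sketch on `x86_64`, assuming AVX is detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let hi = unsafe { _mm_set1_ps(2.0) };
///         let lo = unsafe { _mm_set1_ps(1.0) };
///         let v = unsafe { _mm256_set_m128(hi, lo) };
///         // `lo` lands in the lower half, `hi` in the upper half.
///         assert_eq!(unsafe { _mm256_cvtss_f32(v) }, 1.0);
///         assert_eq!(unsafe { _mm_cvtss_f32(_mm256_extractf128_ps::<1>(v)) }, 2.0);
///     }
/// }
/// ```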
2859#[inline]
2860#[target_feature(enable = "avx")]
2861#[cfg_attr(test, assert_instr(vinsertf128))]
2862#[stable(feature = "simd_x86", since = "1.27.0")]
2863pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
2864    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
2865}
2866
2867/// Sets packed __m256d returned vector with the supplied values; `hi` forms the upper 128 bits and `lo` the lower 128 bits.
2868///
2869/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
2870#[inline]
2871#[target_feature(enable = "avx")]
2872#[cfg_attr(test, assert_instr(vinsertf128))]
2873#[stable(feature = "simd_x86", since = "1.27.0")]
2874pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
2875    unsafe {
2876        let hi: __m128 = transmute(hi);
2877        let lo: __m128 = transmute(lo);
2878        transmute(_mm256_set_m128(hi, lo))
2879    }
2880}
2881
2882/// Sets packed __m256i returned vector with the supplied values; `hi` forms the upper 128 bits and `lo` the lower 128 bits.
2883///
2884/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
2885#[inline]
2886#[target_feature(enable = "avx")]
2887#[cfg_attr(test, assert_instr(vinsertf128))]
2888#[stable(feature = "simd_x86", since = "1.27.0")]
2889pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
2890    unsafe {
2891        let hi: __m128 = transmute(hi);
2892        let lo: __m128 = transmute(lo);
2893        transmute(_mm256_set_m128(hi, lo))
2894    }
2895}
2896
2897/// Sets packed __m256 returned vector with the supplied values; `lo` forms the lower 128 bits and `hi` the upper 128 bits.
2898///
2899/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
2900#[inline]
2901#[target_feature(enable = "avx")]
2902#[cfg_attr(test, assert_instr(vinsertf128))]
2903#[stable(feature = "simd_x86", since = "1.27.0")]
2904pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
2905    _mm256_set_m128(hi, lo)
2906}
2907
2908/// Sets packed __m256d returned vector with the supplied values; `lo` forms the lower 128 bits and `hi` the upper 128 bits.
2909///
2910/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
2911#[inline]
2912#[target_feature(enable = "avx")]
2913#[cfg_attr(test, assert_instr(vinsertf128))]
2914#[stable(feature = "simd_x86", since = "1.27.0")]
2915pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
2916    _mm256_set_m128d(hi, lo)
2917}
2918
2919/// Sets packed __m256i returned vector with the supplied values; `lo` forms the lower 128 bits and `hi` the upper 128 bits.
2920///
2921/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
2922#[inline]
2923#[target_feature(enable = "avx")]
2924#[cfg_attr(test, assert_instr(vinsertf128))]
2925#[stable(feature = "simd_x86", since = "1.27.0")]
2926pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
2927    _mm256_set_m128i(hi, lo)
2928}
2929
2930/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
2931/// floating-point elements) from memory, and combines them into a 256-bit
2932/// value.
2933/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2934///
2935/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
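///
/// # Examples
///
/// A minimal usage sketch on `x86_64`, assuming AVX is detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let lo = [1.0f32, 2.0, 3.0, 4.0];
///         let hi = [5.0f32, 6.0, 7.0, 8.0];
///         // `lo` fills the lower half of the result, `hi` the upper half.
///         let v = unsafe { _mm256_loadu2_m128(hi.as_ptr(), lo.as_ptr()) };
///         assert_eq!(unsafe { _mm256_cvtss_f32(v) }, 1.0);
///     }
/// }
/// ```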
2936#[inline]
2937#[target_feature(enable = "avx")]
2938// This intrinsic has no corresponding instruction.
2939#[stable(feature = "simd_x86", since = "1.27.0")]
2940pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
2941    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
2942    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
2943}
2944
2945/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
2946/// floating-point elements) from memory, and combines them into a 256-bit
2947/// value.
2948/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2949///
2950/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
2951#[inline]
2952#[target_feature(enable = "avx")]
2953// This intrinsic has no corresponding instruction.
2954#[stable(feature = "simd_x86", since = "1.27.0")]
2955pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
2956    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
2957    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
2958}
2959
2960/// Loads two 128-bit values (composed of integer data) from memory, and combines
2961/// them into a 256-bit value.
2962/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2963///
2964/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
2965#[inline]
2966#[target_feature(enable = "avx")]
2967// This intrinsic has no corresponding instruction.
2968#[stable(feature = "simd_x86", since = "1.27.0")]
2969pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
2970    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
2971    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
2972}
2973
2974/// Stores the high and low 128-bit halves (each composed of 4 packed
2975/// single-precision (32-bit) floating-point elements) from `a` into two
2976/// different 128-bit memory locations.
2977/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2978///
2979/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
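///
/// # Examples
///
/// A minimal usage sketch on `x86_64`, assuming AVX is detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let v = unsafe { _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.) };
///         let mut lo = [0.0f32; 4];
///         let mut hi = [0.0f32; 4];
///         unsafe { _mm256_storeu2_m128(hi.as_mut_ptr(), lo.as_mut_ptr(), v) };
///         assert_eq!(lo, [1., 2., 3., 4.]);
///         assert_eq!(hi, [5., 6., 7., 8.]);
///     }
/// }
/// ```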
2980#[inline]
2981#[target_feature(enable = "avx")]
2982// This intrinsic has no corresponding instruction.
2983#[stable(feature = "simd_x86", since = "1.27.0")]
2984pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
2985    let lo = _mm256_castps256_ps128(a);
2986    _mm_storeu_ps(loaddr, lo);
2987    let hi = _mm256_extractf128_ps::<1>(a);
2988    _mm_storeu_ps(hiaddr, hi);
2989}
2990
2991/// Stores the high and low 128-bit halves (each composed of 2 packed
2992/// double-precision (64-bit) floating-point elements) from `a` into two
2993/// different 128-bit memory locations.
2994/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2995///
2996/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
2997#[inline]
2998#[target_feature(enable = "avx")]
2999// This intrinsic has no corresponding instruction.
3000#[stable(feature = "simd_x86", since = "1.27.0")]
3001pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
3002    let lo = _mm256_castpd256_pd128(a);
3003    _mm_storeu_pd(loaddr, lo);
3004    let hi = _mm256_extractf128_pd::<1>(a);
3005    _mm_storeu_pd(hiaddr, hi);
3006}
3007
3008/// Stores the high and low 128-bit halves (each composed of integer data) from
3009/// `a` into two different 128-bit memory locations.
3010/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3011///
3012/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
3013#[inline]
3014#[target_feature(enable = "avx")]
3015// This intrinsic has no corresponding instruction.
3016#[stable(feature = "simd_x86", since = "1.27.0")]
3017pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
3018    let lo = _mm256_castsi256_si128(a);
3019    _mm_storeu_si128(loaddr, lo);
3020    let hi = _mm256_extractf128_si256::<1>(a);
3021    _mm_storeu_si128(hiaddr, hi);
3022}
3023
3024/// Returns the first element of the input vector of `[8 x float]`.
3025///
3026/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
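///
/// # Examples
///
/// A minimal usage sketch on `x86_64`, assuming AVX is detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let v = unsafe { _mm256_setr_ps(9., 1., 2., 3., 4., 5., 6., 7.) };
///         assert_eq!(unsafe { _mm256_cvtss_f32(v) }, 9.0);
///     }
/// }
/// ```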
3027#[inline]
3028#[target_feature(enable = "avx")]
3029//#[cfg_attr(test, assert_instr(movss))] FIXME
3030#[stable(feature = "simd_x86", since = "1.27.0")]
3031pub fn _mm256_cvtss_f32(a: __m256) -> f32 {
3032    unsafe { simd_extract!(a, 0) }
3033}
3034
3035// LLVM intrinsics used in the above functions
3036#[allow(improper_ctypes)]
3037unsafe extern "C" {
3038    #[link_name = "llvm.x86.avx.round.pd.256"]
3039    fn roundpd256(a: __m256d, b: i32) -> __m256d;
3040    #[link_name = "llvm.x86.avx.round.ps.256"]
3041    fn roundps256(a: __m256, b: i32) -> __m256;
3042    #[link_name = "llvm.x86.avx.dp.ps.256"]
3043    fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256;
3044    #[link_name = "llvm.x86.avx.hadd.pd.256"]
3045    fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
3046    #[link_name = "llvm.x86.avx.hadd.ps.256"]
3047    fn vhaddps(a: __m256, b: __m256) -> __m256;
3048    #[link_name = "llvm.x86.avx.hsub.pd.256"]
3049    fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
3050    #[link_name = "llvm.x86.avx.hsub.ps.256"]
3051    fn vhsubps(a: __m256, b: __m256) -> __m256;
3052    #[link_name = "llvm.x86.sse2.cmp.pd"]
3053    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3054    #[link_name = "llvm.x86.avx.cmp.pd.256"]
3055    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
3056    #[link_name = "llvm.x86.sse.cmp.ps"]
3057    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
3058    #[link_name = "llvm.x86.avx.cmp.ps.256"]
3059    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
3060    #[link_name = "llvm.x86.sse2.cmp.sd"]
3061    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3062    #[link_name = "llvm.x86.sse.cmp.ss"]
3063    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
3064    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
3065    fn vcvtps2dq(a: __m256) -> i32x8;
3066    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
3067    fn vcvttpd2dq(a: __m256d) -> i32x4;
3068    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
3069    fn vcvtpd2dq(a: __m256d) -> i32x4;
3070    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
3071    fn vcvttps2dq(a: __m256) -> i32x8;
3072    #[link_name = "llvm.x86.avx.vzeroall"]
3073    fn vzeroall();
3074    #[link_name = "llvm.x86.avx.vzeroupper"]
3075    fn vzeroupper();
3076    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
3077    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
3078    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
3079    fn vpermilps(a: __m128, b: i32x4) -> __m128;
3080    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
3081    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
3082    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
3083    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
3084    #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
3085    fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
3086    #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
3087    fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
3088    #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
3089    fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
3090    #[link_name = "llvm.x86.avx.maskload.pd.256"]
3091    fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
3092    #[link_name = "llvm.x86.avx.maskstore.pd.256"]
3093    fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
3094    #[link_name = "llvm.x86.avx.maskload.pd"]
3095    fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
3096    #[link_name = "llvm.x86.avx.maskstore.pd"]
3097    fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
3098    #[link_name = "llvm.x86.avx.maskload.ps.256"]
3099    fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
3100    #[link_name = "llvm.x86.avx.maskstore.ps.256"]
3101    fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
3102    #[link_name = "llvm.x86.avx.maskload.ps"]
3103    fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
3104    #[link_name = "llvm.x86.avx.maskstore.ps"]
3105    fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
3106    #[link_name = "llvm.x86.avx.ldu.dq.256"]
3107    fn vlddqu(mem_addr: *const i8) -> i8x32;
3108    #[link_name = "llvm.x86.avx.rcp.ps.256"]
3109    fn vrcpps(a: __m256) -> __m256;
3110    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
3111    fn vrsqrtps(a: __m256) -> __m256;
3112    #[link_name = "llvm.x86.avx.ptestz.256"]
3113    fn ptestz256(a: i64x4, b: i64x4) -> i32;
3114    #[link_name = "llvm.x86.avx.ptestc.256"]
3115    fn ptestc256(a: i64x4, b: i64x4) -> i32;
3116    #[link_name = "llvm.x86.avx.ptestnzc.256"]
3117    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
3118    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
3119    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
3120    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
3121    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
3122    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
3123    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
3124    #[link_name = "llvm.x86.avx.vtestz.pd"]
3125    fn vtestzpd(a: __m128d, b: __m128d) -> i32;
3126    #[link_name = "llvm.x86.avx.vtestc.pd"]
3127    fn vtestcpd(a: __m128d, b: __m128d) -> i32;
3128    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
3129    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
3130    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
3131    fn vtestzps256(a: __m256, b: __m256) -> i32;
3132    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
3133    fn vtestcps256(a: __m256, b: __m256) -> i32;
3134    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
3135    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
3136    #[link_name = "llvm.x86.avx.vtestz.ps"]
3137    fn vtestzps(a: __m128, b: __m128) -> i32;
3138    #[link_name = "llvm.x86.avx.vtestc.ps"]
3139    fn vtestcps(a: __m128, b: __m128) -> i32;
3140    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
3141    fn vtestnzcps(a: __m128, b: __m128) -> i32;
3142    #[link_name = "llvm.x86.avx.min.ps.256"]
3143    fn vminps(a: __m256, b: __m256) -> __m256;
3144    #[link_name = "llvm.x86.avx.max.ps.256"]
3145    fn vmaxps(a: __m256, b: __m256) -> __m256;
3146    #[link_name = "llvm.x86.avx.min.pd.256"]
3147    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
3148    #[link_name = "llvm.x86.avx.max.pd.256"]
3149    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
3150}
3151
3152#[cfg(test)]
3153mod tests {
3154    use crate::hint::black_box;
3155    use crate::ptr;
3156    use stdarch_test::simd_test;
3157
3158    use crate::core_arch::x86::*;
3159
3160    #[simd_test(enable = "avx")]
3161    unsafe fn test_mm256_add_pd() {
3162        let a = _mm256_setr_pd(1., 2., 3., 4.);
3163        let b = _mm256_setr_pd(5., 6., 7., 8.);
3164        let r = _mm256_add_pd(a, b);
3165        let e = _mm256_setr_pd(6., 8., 10., 12.);
3166        assert_eq_m256d(r, e);
3167    }
3168
3169    #[simd_test(enable = "avx")]
3170    unsafe fn test_mm256_add_ps() {
3171        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3172        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3173        let r = _mm256_add_ps(a, b);
3174        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3175        assert_eq_m256(r, e);
3176    }
3177
3178    #[simd_test(enable = "avx")]
3179    unsafe fn test_mm256_and_pd() {
3180        let a = _mm256_set1_pd(1.);
3181        let b = _mm256_set1_pd(0.6);
3182        let r = _mm256_and_pd(a, b);
3183        let e = _mm256_set1_pd(0.5);
3184        assert_eq_m256d(r, e);
3185    }
3186
3187    #[simd_test(enable = "avx")]
3188    unsafe fn test_mm256_and_ps() {
3189        let a = _mm256_set1_ps(1.);
3190        let b = _mm256_set1_ps(0.6);
3191        let r = _mm256_and_ps(a, b);
3192        let e = _mm256_set1_ps(0.5);
3193        assert_eq_m256(r, e);
3194    }
3195
3196    #[simd_test(enable = "avx")]
3197    unsafe fn test_mm256_or_pd() {
3198        let a = _mm256_set1_pd(1.);
3199        let b = _mm256_set1_pd(0.6);
3200        let r = _mm256_or_pd(a, b);
3201        let e = _mm256_set1_pd(1.2);
3202        assert_eq_m256d(r, e);
3203    }
3204
3205    #[simd_test(enable = "avx")]
3206    unsafe fn test_mm256_or_ps() {
3207        let a = _mm256_set1_ps(1.);
3208        let b = _mm256_set1_ps(0.6);
3209        let r = _mm256_or_ps(a, b);
3210        let e = _mm256_set1_ps(1.2);
3211        assert_eq_m256(r, e);
3212    }
3213
3214    #[simd_test(enable = "avx")]
3215    unsafe fn test_mm256_shuffle_pd() {
3216        let a = _mm256_setr_pd(1., 4., 5., 8.);
3217        let b = _mm256_setr_pd(2., 3., 6., 7.);
3218        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3219        let e = _mm256_setr_pd(4., 3., 8., 7.);
3220        assert_eq_m256d(r, e);
3221    }
3222
3223    #[simd_test(enable = "avx")]
3224    unsafe fn test_mm256_shuffle_ps() {
3225        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3226        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3227        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3228        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3229        assert_eq_m256(r, e);
3230    }
3231
3232    #[simd_test(enable = "avx")]
3233    unsafe fn test_mm256_andnot_pd() {
3234        let a = _mm256_set1_pd(0.);
3235        let b = _mm256_set1_pd(0.6);
3236        let r = _mm256_andnot_pd(a, b);
3237        assert_eq_m256d(r, b);
3238    }
3239
3240    #[simd_test(enable = "avx")]
3241    unsafe fn test_mm256_andnot_ps() {
3242        let a = _mm256_set1_ps(0.);
3243        let b = _mm256_set1_ps(0.6);
3244        let r = _mm256_andnot_ps(a, b);
3245        assert_eq_m256(r, b);
3246    }
3247
3248    #[simd_test(enable = "avx")]
3249    unsafe fn test_mm256_max_pd() {
3250        let a = _mm256_setr_pd(1., 4., 5., 8.);
3251        let b = _mm256_setr_pd(2., 3., 6., 7.);
3252        let r = _mm256_max_pd(a, b);
3253        let e = _mm256_setr_pd(2., 4., 6., 8.);
3254        assert_eq_m256d(r, e);
3255        // > If the values being compared are both 0.0s (of either sign), the
3256        // > value in the second operand (source operand) is returned.
3257        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3258        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3259        let wu: [u64; 4] = transmute(w);
3260        let xu: [u64; 4] = transmute(x);
3261        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3262        assert_eq!(xu, [0u64; 4]);
3263        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3264        // > second operand (source operand), either a NaN or a valid
3265        // > floating-point value, is written to the result.
3266        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3267        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3268        let yf: [f64; 4] = transmute(y);
3269        let zf: [f64; 4] = transmute(z);
3270        assert_eq!(yf, [0.0; 4]);
3271        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3272    }
3273
3274    #[simd_test(enable = "avx")]
3275    unsafe fn test_mm256_max_ps() {
3276        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3277        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3278        let r = _mm256_max_ps(a, b);
3279        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
3280        assert_eq_m256(r, e);
3281        // > If the values being compared are both 0.0s (of either sign), the
3282        // > value in the second operand (source operand) is returned.
3283        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3284        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3285        let wu: [u32; 8] = transmute(w);
3286        let xu: [u32; 8] = transmute(x);
3287        assert_eq!(wu, [0x8000_0000u32; 8]);
3288        assert_eq!(xu, [0u32; 8]);
3289        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3290        // > second operand (source operand), either a NaN or a valid
3291        // > floating-point value, is written to the result.
3292        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3293        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3294        let yf: [f32; 8] = transmute(y);
3295        let zf: [f32; 8] = transmute(z);
3296        assert_eq!(yf, [0.0; 8]);
3297        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3298    }
3299
3300    #[simd_test(enable = "avx")]
3301    unsafe fn test_mm256_min_pd() {
3302        let a = _mm256_setr_pd(1., 4., 5., 8.);
3303        let b = _mm256_setr_pd(2., 3., 6., 7.);
3304        let r = _mm256_min_pd(a, b);
3305        let e = _mm256_setr_pd(1., 3., 5., 7.);
3306        assert_eq_m256d(r, e);
3307        // > If the values being compared are both 0.0s (of either sign), the
3308        // > value in the second operand (source operand) is returned.
3309        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3310        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3311        let wu: [u64; 4] = transmute(w);
3312        let xu: [u64; 4] = transmute(x);
3313        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3314        assert_eq!(xu, [0u64; 4]);
3315        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3316        // > second operand (source operand), either a NaN or a valid
3317        // > floating-point value, is written to the result.
3318        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3319        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3320        let yf: [f64; 4] = transmute(y);
3321        let zf: [f64; 4] = transmute(z);
3322        assert_eq!(yf, [0.0; 4]);
3323        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3324    }
3325
3326    #[simd_test(enable = "avx")]
3327    unsafe fn test_mm256_min_ps() {
3328        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3329        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3330        let r = _mm256_min_ps(a, b);
3331        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
3332        assert_eq_m256(r, e);
3333        // > If the values being compared are both 0.0s (of either sign), the
3334        // > value in the second operand (source operand) is returned.
3335        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3336        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3337        let wu: [u32; 8] = transmute(w);
3338        let xu: [u32; 8] = transmute(x);
3339        assert_eq!(wu, [0x8000_0000u32; 8]);
3340        assert_eq!(xu, [0u32; 8]);
3341        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3342        // > second operand (source operand), either a NaN or a valid
3343        // > floating-point value, is written to the result.
3344        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3345        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3346        let yf: [f32; 8] = transmute(y);
3347        let zf: [f32; 8] = transmute(z);
3348        assert_eq!(yf, [0.0; 8]);
3349        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3350    }
3351
3352    #[simd_test(enable = "avx")]
3353    unsafe fn test_mm256_mul_pd() {
3354        let a = _mm256_setr_pd(1., 2., 3., 4.);
3355        let b = _mm256_setr_pd(5., 6., 7., 8.);
3356        let r = _mm256_mul_pd(a, b);
3357        let e = _mm256_setr_pd(5., 12., 21., 32.);
3358        assert_eq_m256d(r, e);
3359    }
3360
3361    #[simd_test(enable = "avx")]
3362    unsafe fn test_mm256_mul_ps() {
3363        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3364        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3365        let r = _mm256_mul_ps(a, b);
3366        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3367        assert_eq_m256(r, e);
3368    }
3369
3370    #[simd_test(enable = "avx")]
3371    unsafe fn test_mm256_addsub_pd() {
3372        let a = _mm256_setr_pd(1., 2., 3., 4.);
3373        let b = _mm256_setr_pd(5., 6., 7., 8.);
3374        let r = _mm256_addsub_pd(a, b);
3375        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3376        assert_eq_m256d(r, e);
3377    }
3378
3379    #[simd_test(enable = "avx")]
3380    unsafe fn test_mm256_addsub_ps() {
3381        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3382        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3383        let r = _mm256_addsub_ps(a, b);
3384        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3385        assert_eq_m256(r, e);
3386    }
3387
3388    #[simd_test(enable = "avx")]
3389    unsafe fn test_mm256_sub_pd() {
3390        let a = _mm256_setr_pd(1., 2., 3., 4.);
3391        let b = _mm256_setr_pd(5., 6., 7., 8.);
3392        let r = _mm256_sub_pd(a, b);
3393        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3394        assert_eq_m256d(r, e);
3395    }
3396
3397    #[simd_test(enable = "avx")]
3398    unsafe fn test_mm256_sub_ps() {
3399        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3400        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3401        let r = _mm256_sub_ps(a, b);
3402        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3403        assert_eq_m256(r, e);
3404    }
3405
3406    #[simd_test(enable = "avx")]
3407    unsafe fn test_mm256_round_pd() {
3408        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3409        let result_closest = _mm256_round_pd::<0b0000>(a);
3410        let result_down = _mm256_round_pd::<0b0001>(a);
3411        let result_up = _mm256_round_pd::<0b0010>(a);
3412        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3413        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3414        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3415        assert_eq_m256d(result_closest, expected_closest);
3416        assert_eq_m256d(result_down, expected_down);
3417        assert_eq_m256d(result_up, expected_up);
3418    }
3419
3420    #[simd_test(enable = "avx")]
3421    unsafe fn test_mm256_floor_pd() {
3422        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3423        let result_down = _mm256_floor_pd(a);
3424        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3425        assert_eq_m256d(result_down, expected_down);
3426    }
3427
3428    #[simd_test(enable = "avx")]
3429    unsafe fn test_mm256_ceil_pd() {
3430        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3431        let result_up = _mm256_ceil_pd(a);
3432        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3433        assert_eq_m256d(result_up, expected_up);
3434    }
3435
3436    #[simd_test(enable = "avx")]
3437    unsafe fn test_mm256_round_ps() {
3438        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3439        let result_closest = _mm256_round_ps::<0b0000>(a);
3440        let result_down = _mm256_round_ps::<0b0001>(a);
3441        let result_up = _mm256_round_ps::<0b0010>(a);
3442        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3443        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3444        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3445        assert_eq_m256(result_closest, expected_closest);
3446        assert_eq_m256(result_down, expected_down);
3447        assert_eq_m256(result_up, expected_up);
3448    }
3449
3450    #[simd_test(enable = "avx")]
3451    unsafe fn test_mm256_floor_ps() {
3452        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3453        let result_down = _mm256_floor_ps(a);
3454        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3455        assert_eq_m256(result_down, expected_down);
3456    }
3457
3458    #[simd_test(enable = "avx")]
3459    unsafe fn test_mm256_ceil_ps() {
3460        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3461        let result_up = _mm256_ceil_ps(a);
3462        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3463        assert_eq_m256(result_up, expected_up);
3464    }
3465
3466    #[simd_test(enable = "avx")]
3467    unsafe fn test_mm256_sqrt_pd() {
3468        let a = _mm256_setr_pd(4., 9., 16., 25.);
3469        let r = _mm256_sqrt_pd(a);
3470        let e = _mm256_setr_pd(2., 3., 4., 5.);
3471        assert_eq_m256d(r, e);
3472    }
3473
3474    #[simd_test(enable = "avx")]
3475    unsafe fn test_mm256_sqrt_ps() {
3476        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3477        let r = _mm256_sqrt_ps(a);
3478        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3479        assert_eq_m256(r, e);
3480    }
3481
3482    #[simd_test(enable = "avx")]
3483    unsafe fn test_mm256_div_ps() {
3484        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3485        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3486        let r = _mm256_div_ps(a, b);
3487        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3488        assert_eq_m256(r, e);
3489    }
3490
3491    #[simd_test(enable = "avx")]
3492    unsafe fn test_mm256_div_pd() {
3493        let a = _mm256_setr_pd(4., 9., 16., 25.);
3494        let b = _mm256_setr_pd(4., 3., 2., 5.);
3495        let r = _mm256_div_pd(a, b);
3496        let e = _mm256_setr_pd(1., 3., 8., 5.);
3497        assert_eq_m256d(r, e);
3498    }
3499
3500    #[simd_test(enable = "avx")]
3501    unsafe fn test_mm256_blend_pd() {
3502        let a = _mm256_setr_pd(4., 9., 16., 25.);
3503        let b = _mm256_setr_pd(4., 3., 2., 5.);
3504        let r = _mm256_blend_pd::<0x0>(a, b);
3505        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3506        let r = _mm256_blend_pd::<0x3>(a, b);
3507        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3508        let r = _mm256_blend_pd::<0xF>(a, b);
3509        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3510    }
3511
3512    #[simd_test(enable = "avx")]
3513    unsafe fn test_mm256_blend_ps() {
3514        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3515        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3516        let r = _mm256_blend_ps::<0x0>(a, b);
3517        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3518        let r = _mm256_blend_ps::<0x3>(a, b);
3519        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3520        let r = _mm256_blend_ps::<0xF>(a, b);
3521        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3522    }
3523
3524    #[simd_test(enable = "avx")]
3525    unsafe fn test_mm256_blendv_pd() {
3526        let a = _mm256_setr_pd(4., 9., 16., 25.);
3527        let b = _mm256_setr_pd(4., 3., 2., 5.);
3528        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3529        let r = _mm256_blendv_pd(a, b, c);
3530        let e = _mm256_setr_pd(4., 9., 2., 5.);
3531        assert_eq_m256d(r, e);
3532    }
3533
3534    #[simd_test(enable = "avx")]
3535    unsafe fn test_mm256_blendv_ps() {
3536        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3537        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3538        #[rustfmt::skip]
3539        let c = _mm256_setr_ps(
3540            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3541        );
3542        let r = _mm256_blendv_ps(a, b, c);
3543        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3544        assert_eq_m256(r, e);
3545    }
3546
3547    #[simd_test(enable = "avx")]
3548    unsafe fn test_mm256_dp_ps() {
3549        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3550        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3551        let r = _mm256_dp_ps::<0xFF>(a, b);
3552        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3553        assert_eq_m256(r, e);
3554    }
3555
3556    #[simd_test(enable = "avx")]
3557    unsafe fn test_mm256_hadd_pd() {
3558        let a = _mm256_setr_pd(4., 9., 16., 25.);
3559        let b = _mm256_setr_pd(4., 3., 2., 5.);
3560        let r = _mm256_hadd_pd(a, b);
3561        let e = _mm256_setr_pd(13., 7., 41., 7.);
3562        assert_eq_m256d(r, e);
3563
3564        let a = _mm256_setr_pd(1., 2., 3., 4.);
3565        let b = _mm256_setr_pd(5., 6., 7., 8.);
3566        let r = _mm256_hadd_pd(a, b);
3567        let e = _mm256_setr_pd(3., 11., 7., 15.);
3568        assert_eq_m256d(r, e);
3569    }
3570
3571    #[simd_test(enable = "avx")]
3572    unsafe fn test_mm256_hadd_ps() {
3573        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3574        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3575        let r = _mm256_hadd_ps(a, b);
3576        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3577        assert_eq_m256(r, e);
3578
3579        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3580        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3581        let r = _mm256_hadd_ps(a, b);
3582        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3583        assert_eq_m256(r, e);
3584    }
3585
3586    #[simd_test(enable = "avx")]
3587    unsafe fn test_mm256_hsub_pd() {
3588        let a = _mm256_setr_pd(4., 9., 16., 25.);
3589        let b = _mm256_setr_pd(4., 3., 2., 5.);
3590        let r = _mm256_hsub_pd(a, b);
3591        let e = _mm256_setr_pd(-5., 1., -9., -3.);
3592        assert_eq_m256d(r, e);
3593
3594        let a = _mm256_setr_pd(1., 2., 3., 4.);
3595        let b = _mm256_setr_pd(5., 6., 7., 8.);
3596        let r = _mm256_hsub_pd(a, b);
3597        let e = _mm256_setr_pd(-1., -1., -1., -1.);
3598        assert_eq_m256d(r, e);
3599    }
3600
3601    #[simd_test(enable = "avx")]
3602    unsafe fn test_mm256_hsub_ps() {
3603        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3604        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3605        let r = _mm256_hsub_ps(a, b);
3606        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3607        assert_eq_m256(r, e);
3608
3609        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3610        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3611        let r = _mm256_hsub_ps(a, b);
3612        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3613        assert_eq_m256(r, e);
3614    }
3615
3616    #[simd_test(enable = "avx")]
3617    unsafe fn test_mm256_xor_pd() {
3618        let a = _mm256_setr_pd(4., 9., 16., 25.);
3619        let b = _mm256_set1_pd(0.);
3620        let r = _mm256_xor_pd(a, b);
3621        assert_eq_m256d(r, a);
3622    }
3623
3624    #[simd_test(enable = "avx")]
3625    unsafe fn test_mm256_xor_ps() {
3626        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3627        let b = _mm256_set1_ps(0.);
3628        let r = _mm256_xor_ps(a, b);
3629        assert_eq_m256(r, a);
3630    }
3631
3632    #[simd_test(enable = "avx")]
3633    unsafe fn test_mm_cmp_pd() {
3634        let a = _mm_setr_pd(4., 9.);
3635        let b = _mm_setr_pd(4., 3.);
3636        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
3637        assert!(get_m128d(r, 0).is_nan());
3638        assert!(get_m128d(r, 1).is_nan());
3639    }
3640
3641    #[simd_test(enable = "avx")]
3642    unsafe fn test_mm256_cmp_pd() {
3643        let a = _mm256_setr_pd(1., 2., 3., 4.);
3644        let b = _mm256_setr_pd(5., 6., 7., 8.);
3645        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3646        let e = _mm256_set1_pd(0.);
3647        assert_eq_m256d(r, e);
3648    }
3649
3650    #[simd_test(enable = "avx")]
3651    unsafe fn test_mm_cmp_ps() {
3652        let a = _mm_setr_ps(4., 3., 2., 5.);
3653        let b = _mm_setr_ps(4., 9., 16., 25.);
3654        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
3655        assert!(get_m128(r, 0).is_nan());
3656        assert_eq!(get_m128(r, 1), 0.);
3657        assert_eq!(get_m128(r, 2), 0.);
3658        assert_eq!(get_m128(r, 3), 0.);
3659    }
3660
3661    #[simd_test(enable = "avx")]
3662    unsafe fn test_mm256_cmp_ps() {
3663        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3664        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3665        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3666        let e = _mm256_set1_ps(0.);
3667        assert_eq_m256(r, e);
3668    }
3669
3670    #[simd_test(enable = "avx")]
3671    unsafe fn test_mm_cmp_sd() {
3672        let a = _mm_setr_pd(4., 9.);
3673        let b = _mm_setr_pd(4., 3.);
3674        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
3675        assert!(get_m128d(r, 0).is_nan());
3676        assert_eq!(get_m128d(r, 1), 9.);
3677    }
3678
3679    #[simd_test(enable = "avx")]
3680    unsafe fn test_mm_cmp_ss() {
3681        let a = _mm_setr_ps(4., 3., 2., 5.);
3682        let b = _mm_setr_ps(4., 9., 16., 25.);
3683        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
3684        assert!(get_m128(r, 0).is_nan());
3685        assert_eq!(get_m128(r, 1), 3.);
3686        assert_eq!(get_m128(r, 2), 2.);
3687        assert_eq!(get_m128(r, 3), 5.);
3688    }
3689
3690    #[simd_test(enable = "avx")]
3691    unsafe fn test_mm256_cvtepi32_pd() {
3692        let a = _mm_setr_epi32(4, 9, 16, 25);
3693        let r = _mm256_cvtepi32_pd(a);
3694        let e = _mm256_setr_pd(4., 9., 16., 25.);
3695        assert_eq_m256d(r, e);
3696    }
3697
3698    #[simd_test(enable = "avx")]
3699    unsafe fn test_mm256_cvtepi32_ps() {
3700        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3701        let r = _mm256_cvtepi32_ps(a);
3702        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3703        assert_eq_m256(r, e);
3704    }
3705
3706    #[simd_test(enable = "avx")]
3707    unsafe fn test_mm256_cvtpd_ps() {
3708        let a = _mm256_setr_pd(4., 9., 16., 25.);
3709        let r = _mm256_cvtpd_ps(a);
3710        let e = _mm_setr_ps(4., 9., 16., 25.);
3711        assert_eq_m128(r, e);
3712    }
3713
3714    #[simd_test(enable = "avx")]
3715    unsafe fn test_mm256_cvtps_epi32() {
3716        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3717        let r = _mm256_cvtps_epi32(a);
3718        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3719        assert_eq_m256i(r, e);
3720    }
3721
3722    #[simd_test(enable = "avx")]
3723    unsafe fn test_mm256_cvtps_pd() {
3724        let a = _mm_setr_ps(4., 9., 16., 25.);
3725        let r = _mm256_cvtps_pd(a);
3726        let e = _mm256_setr_pd(4., 9., 16., 25.);
3727        assert_eq_m256d(r, e);
3728    }
3729
3730    #[simd_test(enable = "avx")]
3731    unsafe fn test_mm256_cvtsd_f64() {
3732        let a = _mm256_setr_pd(1., 2., 3., 4.);
3733        let r = _mm256_cvtsd_f64(a);
3734        assert_eq!(r, 1.);
3735    }
3736
3737    #[simd_test(enable = "avx")]
3738    unsafe fn test_mm256_cvttpd_epi32() {
3739        let a = _mm256_setr_pd(4., 9., 16., 25.);
3740        let r = _mm256_cvttpd_epi32(a);
3741        let e = _mm_setr_epi32(4, 9, 16, 25);
3742        assert_eq_m128i(r, e);
3743    }
3744
3745    #[simd_test(enable = "avx")]
3746    unsafe fn test_mm256_cvtpd_epi32() {
3747        let a = _mm256_setr_pd(4., 9., 16., 25.);
3748        let r = _mm256_cvtpd_epi32(a);
3749        let e = _mm_setr_epi32(4, 9, 16, 25);
3750        assert_eq_m128i(r, e);
3751    }
3752
3753    #[simd_test(enable = "avx")]
3754    unsafe fn test_mm256_cvttps_epi32() {
3755        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3756        let r = _mm256_cvttps_epi32(a);
3757        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3758        assert_eq_m256i(r, e);
3759    }
3760
3761    #[simd_test(enable = "avx")]
3762    unsafe fn test_mm256_extractf128_ps() {
3763        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3764        let r = _mm256_extractf128_ps::<0>(a);
3765        let e = _mm_setr_ps(4., 3., 2., 5.);
3766        assert_eq_m128(r, e);
3767    }
3768
3769    #[simd_test(enable = "avx")]
3770    unsafe fn test_mm256_extractf128_pd() {
3771        let a = _mm256_setr_pd(4., 3., 2., 5.);
3772        let r = _mm256_extractf128_pd::<0>(a);
3773        let e = _mm_setr_pd(4., 3.);
3774        assert_eq_m128d(r, e);
3775    }
3776
3777    #[simd_test(enable = "avx")]
3778    unsafe fn test_mm256_extractf128_si256() {
3779        let a = _mm256_setr_epi64x(4, 3, 2, 5);
3780        let r = _mm256_extractf128_si256::<0>(a);
3781        let e = _mm_setr_epi64x(4, 3);
3782        assert_eq_m128i(r, e);
3783    }
3784
3785    #[simd_test(enable = "avx")]
3786    unsafe fn test_mm256_extract_epi32() {
3787        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
3788        let r1 = _mm256_extract_epi32::<0>(a);
3789        let r2 = _mm256_extract_epi32::<3>(a);
3790        assert_eq!(r1, -1);
3791        assert_eq!(r2, 3);
3792    }
3793
3794    #[simd_test(enable = "avx")]
3795    unsafe fn test_mm256_cvtsi256_si32() {
3796        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3797        let r = _mm256_cvtsi256_si32(a);
3798        assert_eq!(r, 1);
3799    }
3800
3801    #[simd_test(enable = "avx")]
3802    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3803    unsafe fn test_mm256_zeroall() {
3804        _mm256_zeroall();
3805    }
3806
3807    #[simd_test(enable = "avx")]
3808    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3809    unsafe fn test_mm256_zeroupper() {
3810        _mm256_zeroupper();
3811    }
3812
3813    #[simd_test(enable = "avx")]
3814    unsafe fn test_mm256_permutevar_ps() {
3815        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3816        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3817        let r = _mm256_permutevar_ps(a, b);
3818        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
3819        assert_eq_m256(r, e);
3820    }
3821
3822    #[simd_test(enable = "avx")]
3823    unsafe fn test_mm_permutevar_ps() {
3824        let a = _mm_setr_ps(4., 3., 2., 5.);
3825        let b = _mm_setr_epi32(1, 2, 3, 4);
3826        let r = _mm_permutevar_ps(a, b);
3827        let e = _mm_setr_ps(3., 2., 5., 4.);
3828        assert_eq_m128(r, e);
3829    }
3830
3831    #[simd_test(enable = "avx")]
3832    unsafe fn test_mm256_permute_ps() {
3833        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3834        let r = _mm256_permute_ps::<0x1b>(a);
3835        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
3836        assert_eq_m256(r, e);
3837    }
3838
3839    #[simd_test(enable = "avx")]
3840    unsafe fn test_mm_permute_ps() {
3841        let a = _mm_setr_ps(4., 3., 2., 5.);
3842        let r = _mm_permute_ps::<0x1b>(a);
3843        let e = _mm_setr_ps(5., 2., 3., 4.);
3844        assert_eq_m128(r, e);
3845    }
3846
3847    #[simd_test(enable = "avx")]
3848    unsafe fn test_mm256_permutevar_pd() {
3849        let a = _mm256_setr_pd(4., 3., 2., 5.);
3850        let b = _mm256_setr_epi64x(1, 2, 3, 4);
3851        let r = _mm256_permutevar_pd(a, b);
3852        let e = _mm256_setr_pd(4., 3., 5., 2.);
3853        assert_eq_m256d(r, e);
3854    }
3855
3856    #[simd_test(enable = "avx")]
3857    unsafe fn test_mm_permutevar_pd() {
3858        let a = _mm_setr_pd(4., 3.);
3859        let b = _mm_setr_epi64x(3, 0);
3860        let r = _mm_permutevar_pd(a, b);
3861        let e = _mm_setr_pd(3., 4.);
3862        assert_eq_m128d(r, e);
3863    }
3864
3865    #[simd_test(enable = "avx")]
3866    unsafe fn test_mm256_permute_pd() {
3867        let a = _mm256_setr_pd(4., 3., 2., 5.);
3868        let r = _mm256_permute_pd::<5>(a);
3869        let e = _mm256_setr_pd(3., 4., 5., 2.);
3870        assert_eq_m256d(r, e);
3871    }
3872
3873    #[simd_test(enable = "avx")]
3874    unsafe fn test_mm_permute_pd() {
3875        let a = _mm_setr_pd(4., 3.);
3876        let r = _mm_permute_pd::<1>(a);
3877        let e = _mm_setr_pd(3., 4.);
3878        assert_eq_m128d(r, e);
3879    }
3880
3881    #[simd_test(enable = "avx")]
3882    unsafe fn test_mm256_permute2f128_ps() {
3883        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3884        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3885        let r = _mm256_permute2f128_ps::<0x13>(a, b);
3886        let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
3887        assert_eq_m256(r, e);
3888    }
3889
3890    #[simd_test(enable = "avx")]
3891    unsafe fn test_mm256_permute2f128_pd() {
3892        let a = _mm256_setr_pd(1., 2., 3., 4.);
3893        let b = _mm256_setr_pd(5., 6., 7., 8.);
3894        let r = _mm256_permute2f128_pd::<0x31>(a, b);
3895        let e = _mm256_setr_pd(3., 4., 7., 8.);
3896        assert_eq_m256d(r, e);
3897    }
3898
3899    #[simd_test(enable = "avx")]
3900    unsafe fn test_mm256_permute2f128_si256() {
3901        let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
3902        let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
3903        let r = _mm256_permute2f128_si256::<0x20>(a, b);
3904        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3905        assert_eq_m256i(r, e);
3906    }
3907
3908    #[simd_test(enable = "avx")]
3909    unsafe fn test_mm256_broadcast_ss() {
3910        let r = _mm256_broadcast_ss(&3.);
3911        let e = _mm256_set1_ps(3.);
3912        assert_eq_m256(r, e);
3913    }
3914
3915    #[simd_test(enable = "avx")]
3916    unsafe fn test_mm_broadcast_ss() {
3917        let r = _mm_broadcast_ss(&3.);
3918        let e = _mm_set1_ps(3.);
3919        assert_eq_m128(r, e);
3920    }
3921
3922    #[simd_test(enable = "avx")]
3923    unsafe fn test_mm256_broadcast_sd() {
3924        let r = _mm256_broadcast_sd(&3.);
3925        let e = _mm256_set1_pd(3.);
3926        assert_eq_m256d(r, e);
3927    }
3928
3929    #[simd_test(enable = "avx")]
3930    unsafe fn test_mm256_broadcast_ps() {
3931        let a = _mm_setr_ps(4., 3., 2., 5.);
3932        let r = _mm256_broadcast_ps(&a);
3933        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
3934        assert_eq_m256(r, e);
3935    }
3936
3937    #[simd_test(enable = "avx")]
3938    unsafe fn test_mm256_broadcast_pd() {
3939        let a = _mm_setr_pd(4., 3.);
3940        let r = _mm256_broadcast_pd(&a);
3941        let e = _mm256_setr_pd(4., 3., 4., 3.);
3942        assert_eq_m256d(r, e);
3943    }
3944
3945    #[simd_test(enable = "avx")]
3946    unsafe fn test_mm256_insertf128_ps() {
3947        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3948        let b = _mm_setr_ps(4., 9., 16., 25.);
3949        let r = _mm256_insertf128_ps::<0>(a, b);
3950        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3951        assert_eq_m256(r, e);
3952    }
3953
3954    #[simd_test(enable = "avx")]
3955    unsafe fn test_mm256_insertf128_pd() {
3956        let a = _mm256_setr_pd(1., 2., 3., 4.);
3957        let b = _mm_setr_pd(5., 6.);
3958        let r = _mm256_insertf128_pd::<0>(a, b);
3959        let e = _mm256_setr_pd(5., 6., 3., 4.);
3960        assert_eq_m256d(r, e);
3961    }
3962
3963    #[simd_test(enable = "avx")]
3964    unsafe fn test_mm256_insertf128_si256() {
3965        let a = _mm256_setr_epi64x(1, 2, 3, 4);
3966        let b = _mm_setr_epi64x(5, 6);
3967        let r = _mm256_insertf128_si256::<0>(a, b);
3968        let e = _mm256_setr_epi64x(5, 6, 3, 4);
3969        assert_eq_m256i(r, e);
3970    }
3971
3972    #[simd_test(enable = "avx")]
3973    unsafe fn test_mm256_insert_epi8() {
3974        #[rustfmt::skip]
3975        let a = _mm256_setr_epi8(
3976            1, 2, 3, 4, 5, 6, 7, 8,
3977            9, 10, 11, 12, 13, 14, 15, 16,
3978            17, 18, 19, 20, 21, 22, 23, 24,
3979            25, 26, 27, 28, 29, 30, 31, 32,
3980        );
3981        let r = _mm256_insert_epi8::<31>(a, 0);
3982        #[rustfmt::skip]
3983        let e = _mm256_setr_epi8(
3984            1, 2, 3, 4, 5, 6, 7, 8,
3985            9, 10, 11, 12, 13, 14, 15, 16,
3986            17, 18, 19, 20, 21, 22, 23, 24,
3987            25, 26, 27, 28, 29, 30, 31, 0,
3988        );
3989        assert_eq_m256i(r, e);
3990    }
3991
3992    #[simd_test(enable = "avx")]
3993    unsafe fn test_mm256_insert_epi16() {
3994        #[rustfmt::skip]
3995        let a = _mm256_setr_epi16(
3996            0, 1, 2, 3, 4, 5, 6, 7,
3997            8, 9, 10, 11, 12, 13, 14, 15,
3998        );
3999        let r = _mm256_insert_epi16::<15>(a, 0);
4000        #[rustfmt::skip]
4001        let e = _mm256_setr_epi16(
4002            0, 1, 2, 3, 4, 5, 6, 7,
4003            8, 9, 10, 11, 12, 13, 14, 0,
4004        );
4005        assert_eq_m256i(r, e);
4006    }
4007
4008    #[simd_test(enable = "avx")]
4009    unsafe fn test_mm256_insert_epi32() {
4010        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4011        let r = _mm256_insert_epi32::<7>(a, 0);
4012        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
4013        assert_eq_m256i(r, e);
4014    }
4015
4016    #[simd_test(enable = "avx")]
4017    unsafe fn test_mm256_load_pd() {
4018        let a = _mm256_setr_pd(1., 2., 3., 4.);
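        // `__m256d` is 32-byte aligned, so the address of the local `a` already
        // satisfies the alignment requirement of this aligned load.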
4019        let p = ptr::addr_of!(a) as *const f64;
4020        let r = _mm256_load_pd(p);
4021        let e = _mm256_setr_pd(1., 2., 3., 4.);
4022        assert_eq_m256d(r, e);
4023    }
4024
4025    #[simd_test(enable = "avx")]
4026    unsafe fn test_mm256_store_pd() {
4027        let a = _mm256_setr_pd(1., 2., 3., 4.);
4028        let mut r = _mm256_undefined_pd();
4029        _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4030        assert_eq_m256d(r, a);
4031    }
4032
4033    #[simd_test(enable = "avx")]
4034    unsafe fn test_mm256_load_ps() {
4035        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4036        let p = ptr::addr_of!(a) as *const f32;
4037        let r = _mm256_load_ps(p);
4038        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4039        assert_eq_m256(r, e);
4040    }
4041
4042    #[simd_test(enable = "avx")]
4043    unsafe fn test_mm256_store_ps() {
4044        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4045        let mut r = _mm256_undefined_ps();
4046        _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4047        assert_eq_m256(r, a);
4048    }
4049
4050    #[simd_test(enable = "avx")]
4051    unsafe fn test_mm256_loadu_pd() {
4052        let a = &[1.0f64, 2., 3., 4.];
4053        let p = a.as_ptr();
4054        let r = _mm256_loadu_pd(black_box(p));
4055        let e = _mm256_setr_pd(1., 2., 3., 4.);
4056        assert_eq_m256d(r, e);
4057    }
4058
4059    #[simd_test(enable = "avx")]
4060    unsafe fn test_mm256_storeu_pd() {
4061        let a = _mm256_set1_pd(9.);
4062        let mut r = _mm256_undefined_pd();
4063        _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4064        assert_eq_m256d(r, a);
4065    }
4066
4067    #[simd_test(enable = "avx")]
4068    unsafe fn test_mm256_loadu_ps() {
4069        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
4070        let p = a.as_ptr();
4071        let r = _mm256_loadu_ps(black_box(p));
4072        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4073        assert_eq_m256(r, e);
4074    }
4075
4076    #[simd_test(enable = "avx")]
4077    unsafe fn test_mm256_storeu_ps() {
4078        let a = _mm256_set1_ps(9.);
4079        let mut r = _mm256_undefined_ps();
4080        _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4081        assert_eq_m256(r, a);
4082    }
4083
4084    #[simd_test(enable = "avx")]
4085    unsafe fn test_mm256_load_si256() {
4086        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4087        let p = ptr::addr_of!(a);
4088        let r = _mm256_load_si256(p);
4089        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4090        assert_eq_m256i(r, e);
4091    }
4092
4093    #[simd_test(enable = "avx")]
4094    unsafe fn test_mm256_store_si256() {
4095        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4096        let mut r = _mm256_undefined_si256();
4097        _mm256_store_si256(ptr::addr_of_mut!(r), a);
4098        assert_eq_m256i(r, a);
4099    }
4100
4101    #[simd_test(enable = "avx")]
4102    unsafe fn test_mm256_loadu_si256() {
4103        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4104        let p = ptr::addr_of!(a);
4105        let r = _mm256_loadu_si256(black_box(p));
4106        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4107        assert_eq_m256i(r, e);
4108    }
4109
4110    #[simd_test(enable = "avx")]
4111    unsafe fn test_mm256_storeu_si256() {
4112        let a = _mm256_set1_epi8(9);
4113        let mut r = _mm256_undefined_si256();
4114        _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4115        assert_eq_m256i(r, a);
4116    }
4117
4118    #[simd_test(enable = "avx")]
4119    unsafe fn test_mm256_maskload_pd() {
4120        let a = &[1.0f64, 2., 3., 4.];
4121        let p = a.as_ptr();
4122        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
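        // Only lanes whose mask element has its most significant bit set (`!0`)
        // are loaded; the remaining lanes are zeroed.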
4123        let r = _mm256_maskload_pd(black_box(p), mask);
4124        let e = _mm256_setr_pd(0., 2., 0., 4.);
4125        assert_eq_m256d(r, e);
4126    }
4127
4128    #[simd_test(enable = "avx")]
4129    unsafe fn test_mm256_maskstore_pd() {
4130        let mut r = _mm256_set1_pd(0.);
4131        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4132        let a = _mm256_setr_pd(1., 2., 3., 4.);
4133        _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4134        let e = _mm256_setr_pd(0., 2., 0., 4.);
4135        assert_eq_m256d(r, e);
4136    }
4137
4138    #[simd_test(enable = "avx")]
4139    unsafe fn test_mm_maskload_pd() {
4140        let a = &[1.0f64, 2.];
4141        let p = a.as_ptr();
4142        let mask = _mm_setr_epi64x(0, !0);
4143        let r = _mm_maskload_pd(black_box(p), mask);
4144        let e = _mm_setr_pd(0., 2.);
4145        assert_eq_m128d(r, e);
4146    }
4147
4148    #[simd_test(enable = "avx")]
4149    unsafe fn test_mm_maskstore_pd() {
4150        let mut r = _mm_set1_pd(0.);
4151        let mask = _mm_setr_epi64x(0, !0);
4152        let a = _mm_setr_pd(1., 2.);
4153        _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4154        let e = _mm_setr_pd(0., 2.);
4155        assert_eq_m128d(r, e);
4156    }
4157
4158    #[simd_test(enable = "avx")]
4159    unsafe fn test_mm256_maskload_ps() {
4160        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4161        let p = a.as_ptr();
4162        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4163        let r = _mm256_maskload_ps(black_box(p), mask);
4164        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4165        assert_eq_m256(r, e);
4166    }
4167
4168    #[simd_test(enable = "avx")]
4169    unsafe fn test_mm256_maskstore_ps() {
4170        let mut r = _mm256_set1_ps(0.);
4171        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4172        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4173        _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4174        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4175        assert_eq_m256(r, e);
4176    }
4177
4178    #[simd_test(enable = "avx")]
4179    unsafe fn test_mm_maskload_ps() {
4180        let a = &[1.0f32, 2., 3., 4.];
4181        let p = a.as_ptr();
4182        let mask = _mm_setr_epi32(0, !0, 0, !0);
4183        let r = _mm_maskload_ps(black_box(p), mask);
4184        let e = _mm_setr_ps(0., 2., 0., 4.);
4185        assert_eq_m128(r, e);
4186    }
4187
4188    #[simd_test(enable = "avx")]
4189    unsafe fn test_mm_maskstore_ps() {
4190        let mut r = _mm_set1_ps(0.);
4191        let mask = _mm_setr_epi32(0, !0, 0, !0);
4192        let a = _mm_setr_ps(1., 2., 3., 4.);
4193        _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4194        let e = _mm_setr_ps(0., 2., 0., 4.);
4195        assert_eq_m128(r, e);
4196    }
4197
4198    #[simd_test(enable = "avx")]
4199    unsafe fn test_mm256_movehdup_ps() {
4200        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
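        // vmovshdup duplicates the odd-indexed single of each pair, so
        // (1, 2, 3, 4, ...) becomes (2, 2, 4, 4, ...).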
4201        let r = _mm256_movehdup_ps(a);
4202        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4203        assert_eq_m256(r, e);
4204    }
4205
4206    #[simd_test(enable = "avx")]
4207    unsafe fn test_mm256_moveldup_ps() {
4208        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4209        let r = _mm256_moveldup_ps(a);
4210        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4211        assert_eq_m256(r, e);
4212    }
4213
4214    #[simd_test(enable = "avx")]
4215    unsafe fn test_mm256_movedup_pd() {
4216        let a = _mm256_setr_pd(1., 2., 3., 4.);
4217        let r = _mm256_movedup_pd(a);
4218        let e = _mm256_setr_pd(1., 1., 3., 3.);
4219        assert_eq_m256d(r, e);
4220    }
4221
4222    #[simd_test(enable = "avx")]
4223    unsafe fn test_mm256_lddqu_si256() {
4224        #[rustfmt::skip]
4225        let a = _mm256_setr_epi8(
4226            1, 2, 3, 4, 5, 6, 7, 8,
4227            9, 10, 11, 12, 13, 14, 15, 16,
4228            17, 18, 19, 20, 21, 22, 23, 24,
4229            25, 26, 27, 28, 29, 30, 31, 32,
4230        );
4231        let p = ptr::addr_of!(a);
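        // vlddqu is an unaligned load like vmovdqu; it is documented as possibly
        // faster when the data straddles a cache-line boundary.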
4232        let r = _mm256_lddqu_si256(black_box(p));
4233        #[rustfmt::skip]
4234        let e = _mm256_setr_epi8(
4235            1, 2, 3, 4, 5, 6, 7, 8,
4236            9, 10, 11, 12, 13, 14, 15, 16,
4237            17, 18, 19, 20, 21, 22, 23, 24,
4238            25, 26, 27, 28, 29, 30, 31, 32,
4239        );
4240        assert_eq_m256i(r, e);
4241    }
4242
4243    #[simd_test(enable = "avx")]
4244    #[cfg_attr(miri, ignore)] // Miri does not support non-temporal stores
4245    unsafe fn test_mm256_stream_si256() {
4246        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4247        let mut r = _mm256_undefined_si256();
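        // A non-temporal store bypasses the cache. Reading `r` back on the same
        // thread is fine; publishing the data to other threads would additionally
        // require an `_mm_sfence()`.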
4248        _mm256_stream_si256(ptr::addr_of_mut!(r), a);
4249        assert_eq_m256i(r, a);
4250    }
4251
4252    #[simd_test(enable = "avx")]
4253    #[cfg_attr(miri, ignore)] // Miri does not support non-temporal stores
4254    unsafe fn test_mm256_stream_pd() {
4255        #[repr(align(32))]
4256        struct Memory {
4257            pub data: [f64; 4],
4258        }
4259        let a = _mm256_set1_pd(7.0);
4260        let mut mem = Memory { data: [-1.0; 4] };
4261
4262        _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4263        for i in 0..4 {
4264            assert_eq!(mem.data[i], get_m256d(a, i));
4265        }
4266    }
4267
4268    #[simd_test(enable = "avx")]
4269    #[cfg_attr(miri, ignore)] // Miri does not support non-temporal stores
4270    unsafe fn test_mm256_stream_ps() {
4271        #[repr(align(32))]
4272        struct Memory {
4273            pub data: [f32; 8],
4274        }
4275        let a = _mm256_set1_ps(7.0);
4276        let mut mem = Memory { data: [-1.0; 8] };
4277
4278        _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
4279        for i in 0..8 {
4280            assert_eq!(mem.data[i], get_m256(a, i));
4281        }
4282    }
4283
4284    #[simd_test(enable = "avx")]
4285    unsafe fn test_mm256_rcp_ps() {
4286        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4287        let r = _mm256_rcp_ps(a);
4288        #[rustfmt::skip]
4289        let e = _mm256_setr_ps(
4290            0.99975586, 0.49987793, 0.33325195, 0.24993896,
4291            0.19995117, 0.16662598, 0.14282227, 0.12496948,
4292        );
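        // vrcpps only approximates the reciprocal (documented maximum relative
        // error of 1.5 * 2^-12), hence the loose tolerance below instead of an
        // exact comparison.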
4293        let rel_err = 0.00048828125;
4294        for i in 0..8 {
4295            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4296        }
4297    }
4298
4299    #[simd_test(enable = "avx")]
4300    unsafe fn test_mm256_rsqrt_ps() {
4301        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4302        let r = _mm256_rsqrt_ps(a);
4303        #[rustfmt::skip]
4304        let e = _mm256_setr_ps(
4305            0.99975586, 0.7069092, 0.5772705, 0.49987793,
4306            0.44714355, 0.40820313, 0.3779297, 0.3534546,
4307        );
4308        let rel_err = 0.00048828125;
4309        for i in 0..8 {
4310            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4311        }
4312    }
4313
4314    #[simd_test(enable = "avx")]
4315    unsafe fn test_mm256_unpackhi_pd() {
4316        let a = _mm256_setr_pd(1., 2., 3., 4.);
4317        let b = _mm256_setr_pd(5., 6., 7., 8.);
4318        let r = _mm256_unpackhi_pd(a, b);
4319        let e = _mm256_setr_pd(2., 6., 4., 8.);
4320        assert_eq_m256d(r, e);
4321    }
4322
4323    #[simd_test(enable = "avx")]
4324    unsafe fn test_mm256_unpackhi_ps() {
4325        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4326        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4327        let r = _mm256_unpackhi_ps(a, b);
4328        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
4329        assert_eq_m256(r, e);
4330    }
4331
4332    #[simd_test(enable = "avx")]
4333    unsafe fn test_mm256_unpacklo_pd() {
4334        let a = _mm256_setr_pd(1., 2., 3., 4.);
4335        let b = _mm256_setr_pd(5., 6., 7., 8.);
4336        let r = _mm256_unpacklo_pd(a, b);
4337        let e = _mm256_setr_pd(1., 5., 3., 7.);
4338        assert_eq_m256d(r, e);
4339    }
4340
4341    #[simd_test(enable = "avx")]
4342    unsafe fn test_mm256_unpacklo_ps() {
4343        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4344        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4345        let r = _mm256_unpacklo_ps(a, b);
4346        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
4347        assert_eq_m256(r, e);
4348    }
4349
4350    #[simd_test(enable = "avx")]
4351    unsafe fn test_mm256_testz_si256() {
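        // vptest semantics: testz returns 1 iff (a & b) is all zeros, testc
        // returns 1 iff (!a & b) is all zeros, and testnzc returns 1 iff neither
        // of those holds.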
4352        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4353        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4354        let r = _mm256_testz_si256(a, b);
4355        assert_eq!(r, 0);
4356        let b = _mm256_set1_epi64x(0);
4357        let r = _mm256_testz_si256(a, b);
4358        assert_eq!(r, 1);
4359    }
4360
4361    #[simd_test(enable = "avx")]
4362    unsafe fn test_mm256_testc_si256() {
4363        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4364        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4365        let r = _mm256_testc_si256(a, b);
4366        assert_eq!(r, 0);
4367        let b = _mm256_set1_epi64x(0);
4368        let r = _mm256_testc_si256(a, b);
4369        assert_eq!(r, 1);
4370    }
4371
4372    #[simd_test(enable = "avx")]
4373    unsafe fn test_mm256_testnzc_si256() {
4374        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4375        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4376        let r = _mm256_testnzc_si256(a, b);
4377        assert_eq!(r, 1);
4378        let a = _mm256_setr_epi64x(0, 0, 0, 0);
4379        let b = _mm256_setr_epi64x(0, 0, 0, 0);
4380        let r = _mm256_testnzc_si256(a, b);
4381        assert_eq!(r, 0);
4382    }
4383
4384    #[simd_test(enable = "avx")]
4385    unsafe fn test_mm256_testz_pd() {
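        // vtestpd inspects only the sign bits: testz_pd returns 1 iff no element
        // has its sign bit set in both `a` and `b`, so all-positive inputs give 1.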
4386        let a = _mm256_setr_pd(1., 2., 3., 4.);
4387        let b = _mm256_setr_pd(5., 6., 7., 8.);
4388        let r = _mm256_testz_pd(a, b);
4389        assert_eq!(r, 1);
4390        let a = _mm256_set1_pd(-1.);
4391        let r = _mm256_testz_pd(a, a);
4392        assert_eq!(r, 0);
4393    }
4394
4395    #[simd_test(enable = "avx")]
4396    unsafe fn test_mm256_testc_pd() {
4397        let a = _mm256_setr_pd(1., 2., 3., 4.);
4398        let b = _mm256_setr_pd(5., 6., 7., 8.);
4399        let r = _mm256_testc_pd(a, b);
4400        assert_eq!(r, 1);
4401        let a = _mm256_set1_pd(1.);
4402        let b = _mm256_set1_pd(-1.);
4403        let r = _mm256_testc_pd(a, b);
4404        assert_eq!(r, 0);
4405    }
4406
4407    #[simd_test(enable = "avx")]
4408    unsafe fn test_mm256_testnzc_pd() {
4409        let a = _mm256_setr_pd(1., 2., 3., 4.);
4410        let b = _mm256_setr_pd(5., 6., 7., 8.);
4411        let r = _mm256_testnzc_pd(a, b);
4412        assert_eq!(r, 0);
4413        let a = _mm256_setr_pd(1., -1., -1., -1.);
4414        let b = _mm256_setr_pd(-1., -1., 1., 1.);
4415        let r = _mm256_testnzc_pd(a, b);
4416        assert_eq!(r, 1);
4417    }
4418
4419    #[simd_test(enable = "avx")]
4420    unsafe fn test_mm_testz_pd() {
4421        let a = _mm_setr_pd(1., 2.);
4422        let b = _mm_setr_pd(5., 6.);
4423        let r = _mm_testz_pd(a, b);
4424        assert_eq!(r, 1);
4425        let a = _mm_set1_pd(-1.);
4426        let r = _mm_testz_pd(a, a);
4427        assert_eq!(r, 0);
4428    }
4429
4430    #[simd_test(enable = "avx")]
4431    unsafe fn test_mm_testc_pd() {
4432        let a = _mm_setr_pd(1., 2.);
4433        let b = _mm_setr_pd(5., 6.);
4434        let r = _mm_testc_pd(a, b);
4435        assert_eq!(r, 1);
4436        let a = _mm_set1_pd(1.);
4437        let b = _mm_set1_pd(-1.);
4438        let r = _mm_testc_pd(a, b);
4439        assert_eq!(r, 0);
4440    }
4441
4442    #[simd_test(enable = "avx")]
4443    unsafe fn test_mm_testnzc_pd() {
4444        let a = _mm_setr_pd(1., 2.);
4445        let b = _mm_setr_pd(5., 6.);
4446        let r = _mm_testnzc_pd(a, b);
4447        assert_eq!(r, 0);
4448        let a = _mm_setr_pd(1., -1.);
4449        let b = _mm_setr_pd(-1., -1.);
4450        let r = _mm_testnzc_pd(a, b);
4451        assert_eq!(r, 1);
4452    }
4453
4454    #[simd_test(enable = "avx")]
4455    unsafe fn test_mm256_testz_ps() {
4456        let a = _mm256_set1_ps(1.);
4457        let r = _mm256_testz_ps(a, a);
4458        assert_eq!(r, 1);
4459        let a = _mm256_set1_ps(-1.);
4460        let r = _mm256_testz_ps(a, a);
4461        assert_eq!(r, 0);
4462    }
4463
4464    #[simd_test(enable = "avx")]
4465    unsafe fn test_mm256_testc_ps() {
4466        let a = _mm256_set1_ps(1.);
4467        let r = _mm256_testc_ps(a, a);
4468        assert_eq!(r, 1);
4469        let b = _mm256_set1_ps(-1.);
4470        let r = _mm256_testc_ps(a, b);
4471        assert_eq!(r, 0);
4472    }
4473
4474    #[simd_test(enable = "avx")]
4475    unsafe fn test_mm256_testnzc_ps() {
4476        let a = _mm256_set1_ps(1.);
4477        let r = _mm256_testnzc_ps(a, a);
4478        assert_eq!(r, 0);
4479        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
4480        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
4481        let r = _mm256_testnzc_ps(a, b);
4482        assert_eq!(r, 1);
4483    }
4484
4485    #[simd_test(enable = "avx")]
4486    unsafe fn test_mm_testz_ps() {
4487        let a = _mm_set1_ps(1.);
4488        let r = _mm_testz_ps(a, a);
4489        assert_eq!(r, 1);
4490        let a = _mm_set1_ps(-1.);
4491        let r = _mm_testz_ps(a, a);
4492        assert_eq!(r, 0);
4493    }
4494
4495    #[simd_test(enable = "avx")]
4496    unsafe fn test_mm_testc_ps() {
4497        let a = _mm_set1_ps(1.);
4498        let r = _mm_testc_ps(a, a);
4499        assert_eq!(r, 1);
4500        let b = _mm_set1_ps(-1.);
4501        let r = _mm_testc_ps(a, b);
4502        assert_eq!(r, 0);
4503    }
4504
4505    #[simd_test(enable = "avx")]
4506    unsafe fn test_mm_testnzc_ps() {
4507        let a = _mm_set1_ps(1.);
4508        let r = _mm_testnzc_ps(a, a);
4509        assert_eq!(r, 0);
4510        let a = _mm_setr_ps(1., -1., -1., -1.);
4511        let b = _mm_setr_ps(-1., -1., 1., 1.);
4512        let r = _mm_testnzc_ps(a, b);
4513        assert_eq!(r, 1);
4514    }
4515
4516    #[simd_test(enable = "avx")]
4517    unsafe fn test_mm256_movemask_pd() {
4518        let a = _mm256_setr_pd(1., -2., 3., -4.);
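        // The sign bit of each element becomes one bit of the mask, so the two
        // negative lanes (indices 1 and 3) produce 0b1010.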
4519        let r = _mm256_movemask_pd(a);
4520        assert_eq!(r, 0xA);
4521    }
4522
4523    #[simd_test(enable = "avx")]
4524    unsafe fn test_mm256_movemask_ps() {
4525        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
4526        let r = _mm256_movemask_ps(a);
4527        assert_eq!(r, 0xAA);
4528    }
4529
4530    #[simd_test(enable = "avx")]
4531    unsafe fn test_mm256_setzero_pd() {
4532        let r = _mm256_setzero_pd();
4533        assert_eq_m256d(r, _mm256_set1_pd(0.));
4534    }
4535
4536    #[simd_test(enable = "avx")]
4537    unsafe fn test_mm256_setzero_ps() {
4538        let r = _mm256_setzero_ps();
4539        assert_eq_m256(r, _mm256_set1_ps(0.));
4540    }
4541
4542    #[simd_test(enable = "avx")]
4543    unsafe fn test_mm256_setzero_si256() {
4544        let r = _mm256_setzero_si256();
4545        assert_eq_m256i(r, _mm256_set1_epi8(0));
4546    }
4547
4548    #[simd_test(enable = "avx")]
4549    unsafe fn test_mm256_set_pd() {
4550        let r = _mm256_set_pd(1., 2., 3., 4.);
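        // `set` takes its arguments from the highest element down, `setr` from
        // the lowest up, hence the reversed order in the expected value.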
4551        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4552    }
4553
4554    #[simd_test(enable = "avx")]
4555    unsafe fn test_mm256_set_ps() {
4556        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4557        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
4558    }
4559
4560    #[simd_test(enable = "avx")]
4561    unsafe fn test_mm256_set_epi8() {
4562        #[rustfmt::skip]
4563        let r = _mm256_set_epi8(
4564            1, 2, 3, 4, 5, 6, 7, 8,
4565            9, 10, 11, 12, 13, 14, 15, 16,
4566            17, 18, 19, 20, 21, 22, 23, 24,
4567            25, 26, 27, 28, 29, 30, 31, 32,
4568        );
4569        #[rustfmt::skip]
4570        let e = _mm256_setr_epi8(
4571            32, 31, 30, 29, 28, 27, 26, 25,
4572            24, 23, 22, 21, 20, 19, 18, 17,
4573            16, 15, 14, 13, 12, 11, 10, 9,
4574            8, 7, 6, 5, 4, 3, 2, 1
4575        );
4576        assert_eq_m256i(r, e);
4577    }
4578
4579    #[simd_test(enable = "avx")]
4580    unsafe fn test_mm256_set_epi16() {
4581        #[rustfmt::skip]
4582        let r = _mm256_set_epi16(
4583            1, 2, 3, 4, 5, 6, 7, 8,
4584            9, 10, 11, 12, 13, 14, 15, 16,
4585        );
4586        #[rustfmt::skip]
4587        let e = _mm256_setr_epi16(
4588            16, 15, 14, 13, 12, 11, 10, 9,
4589            8, 7, 6, 5, 4, 3, 2, 1,
4590        );
4591        assert_eq_m256i(r, e);
4592    }
4593
4594    #[simd_test(enable = "avx")]
4595    unsafe fn test_mm256_set_epi32() {
4596        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4597        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4598    }
4599
4600    #[simd_test(enable = "avx")]
4601    unsafe fn test_mm256_set_epi64x() {
4602        let r = _mm256_set_epi64x(1, 2, 3, 4);
4603        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4604    }
4605
4606    #[simd_test(enable = "avx")]
4607    unsafe fn test_mm256_setr_pd() {
4608        let r = _mm256_setr_pd(1., 2., 3., 4.);
4609        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4610    }
4611
4612    #[simd_test(enable = "avx")]
4613    unsafe fn test_mm256_setr_ps() {
4614        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4615        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
4616    }
4617
4618    #[simd_test(enable = "avx")]
4619    unsafe fn test_mm256_setr_epi8() {
4620        #[rustfmt::skip]
4621        let r = _mm256_setr_epi8(
4622            1, 2, 3, 4, 5, 6, 7, 8,
4623            9, 10, 11, 12, 13, 14, 15, 16,
4624            17, 18, 19, 20, 21, 22, 23, 24,
4625            25, 26, 27, 28, 29, 30, 31, 32,
4626        );
4627        #[rustfmt::skip]
4628        let e = _mm256_setr_epi8(
4629            1, 2, 3, 4, 5, 6, 7, 8,
4630            9, 10, 11, 12, 13, 14, 15, 16,
4631            17, 18, 19, 20, 21, 22, 23, 24,
4632            25, 26, 27, 28, 29, 30, 31, 32
4633        );
4634
4635        assert_eq_m256i(r, e);
4636    }
4637
4638    #[simd_test(enable = "avx")]
4639    unsafe fn test_mm256_setr_epi16() {
4640        #[rustfmt::skip]
4641        let r = _mm256_setr_epi16(
4642            1, 2, 3, 4, 5, 6, 7, 8,
4643            9, 10, 11, 12, 13, 14, 15, 16,
4644        );
4645        #[rustfmt::skip]
4646        let e = _mm256_setr_epi16(
4647            1, 2, 3, 4, 5, 6, 7, 8,
4648            9, 10, 11, 12, 13, 14, 15, 16,
4649        );
4650        assert_eq_m256i(r, e);
4651    }
4652
4653    #[simd_test(enable = "avx")]
4654    unsafe fn test_mm256_setr_epi32() {
4655        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4656        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4657    }
4658
4659    #[simd_test(enable = "avx")]
4660    unsafe fn test_mm256_setr_epi64x() {
4661        let r = _mm256_setr_epi64x(1, 2, 3, 4);
4662        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4663    }
4664
4665    #[simd_test(enable = "avx")]
4666    unsafe fn test_mm256_set1_pd() {
4667        let r = _mm256_set1_pd(1.);
4668        assert_eq_m256d(r, _mm256_set1_pd(1.));
4669    }
4670
4671    #[simd_test(enable = "avx")]
4672    unsafe fn test_mm256_set1_ps() {
4673        let r = _mm256_set1_ps(1.);
4674        assert_eq_m256(r, _mm256_set1_ps(1.));
4675    }
4676
4677    #[simd_test(enable = "avx")]
4678    unsafe fn test_mm256_set1_epi8() {
4679        let r = _mm256_set1_epi8(1);
4680        assert_eq_m256i(r, _mm256_set1_epi8(1));
4681    }
4682
4683    #[simd_test(enable = "avx")]
4684    unsafe fn test_mm256_set1_epi16() {
4685        let r = _mm256_set1_epi16(1);
4686        assert_eq_m256i(r, _mm256_set1_epi16(1));
4687    }
4688
4689    #[simd_test(enable = "avx")]
4690    unsafe fn test_mm256_set1_epi32() {
4691        let r = _mm256_set1_epi32(1);
4692        assert_eq_m256i(r, _mm256_set1_epi32(1));
4693    }
4694
4695    #[simd_test(enable = "avx")]
4696    unsafe fn test_mm256_set1_epi64x() {
4697        let r = _mm256_set1_epi64x(1);
4698        assert_eq_m256i(r, _mm256_set1_epi64x(1));
4699    }
4700
4701    #[simd_test(enable = "avx")]
4702    unsafe fn test_mm256_castpd_ps() {
4703        let a = _mm256_setr_pd(1., 2., 3., 4.);
4704        let r = _mm256_castpd_ps(a);
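        // The cast is a pure bit reinterpretation: each f64 becomes two f32
        // lanes, e.g. 1.0f64 (0x3FF0_0000_0000_0000) reads back as (0.0, 1.875).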
4705        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4706        assert_eq_m256(r, e);
4707    }
4708
4709    #[simd_test(enable = "avx")]
4710    unsafe fn test_mm256_castps_pd() {
4711        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4712        let r = _mm256_castps_pd(a);
4713        let e = _mm256_setr_pd(1., 2., 3., 4.);
4714        assert_eq_m256d(r, e);
4715    }
4716
4717    #[simd_test(enable = "avx")]
4718    unsafe fn test_mm256_castps_si256() {
4719        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4720        let r = _mm256_castps_si256(a);
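        // Again a bit reinterpretation: 1.0f32 is 0x3F80_0000, i.e. the
        // little-endian bytes (0, 0, -128, 63), and so on for the other lanes.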
4721        #[rustfmt::skip]
4722        let e = _mm256_setr_epi8(
4723            0, 0, -128, 63, 0, 0, 0, 64,
4724            0, 0, 64, 64, 0, 0, -128, 64,
4725            0, 0, -96, 64, 0, 0, -64, 64,
4726            0, 0, -32, 64, 0, 0, 0, 65,
4727        );
4728        assert_eq_m256i(r, e);
4729    }
4730
4731    #[simd_test(enable = "avx")]
4732    unsafe fn test_mm256_castsi256_ps() {
4733        #[rustfmt::skip]
4734        let a = _mm256_setr_epi8(
4735            0, 0, -128, 63, 0, 0, 0, 64,
4736            0, 0, 64, 64, 0, 0, -128, 64,
4737            0, 0, -96, 64, 0, 0, -64, 64,
4738            0, 0, -32, 64, 0, 0, 0, 65,
4739        );
4740        let r = _mm256_castsi256_ps(a);
4741        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4742        assert_eq_m256(r, e);
4743    }
4744
4745    #[simd_test(enable = "avx")]
4746    unsafe fn test_mm256_castpd_si256() {
4747        let a = _mm256_setr_pd(1., 2., 3., 4.);
4748        let r = _mm256_castpd_si256(a);
4749        assert_eq_m256d(transmute(r), a);
4750    }
4751
4752    #[simd_test(enable = "avx")]
4753    unsafe fn test_mm256_castsi256_pd() {
4754        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4755        let r = _mm256_castsi256_pd(a);
4756        assert_eq_m256d(r, transmute(a));
4757    }
4758
4759    #[simd_test(enable = "avx")]
4760    unsafe fn test_mm256_castps256_ps128() {
4761        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4762        let r = _mm256_castps256_ps128(a);
4763        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
4764    }
4765
4766    #[simd_test(enable = "avx")]
4767    unsafe fn test_mm256_castpd256_pd128() {
4768        let a = _mm256_setr_pd(1., 2., 3., 4.);
4769        let r = _mm256_castpd256_pd128(a);
4770        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
4771    }
4772
4773    #[simd_test(enable = "avx")]
4774    unsafe fn test_mm256_castsi256_si128() {
4775        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4776        let r = _mm256_castsi256_si128(a);
4777        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
4778    }
4779
4780    #[simd_test(enable = "avx")]
4781    unsafe fn test_mm256_castps128_ps256() {
4782        let a = _mm_setr_ps(1., 2., 3., 4.);
4783        let r = _mm256_castps128_ps256(a);
4784        assert_eq_m128(_mm256_castps256_ps128(r), a);
4785    }
4786
4787    #[simd_test(enable = "avx")]
4788    unsafe fn test_mm256_castpd128_pd256() {
4789        let a = _mm_setr_pd(1., 2.);
4790        let r = _mm256_castpd128_pd256(a);
4791        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
4792    }
4793
4794    #[simd_test(enable = "avx")]
4795    unsafe fn test_mm256_castsi128_si256() {
4796        let a = _mm_setr_epi32(1, 2, 3, 4);
4797        let r = _mm256_castsi128_si256(a);
4798        assert_eq_m128i(_mm256_castsi256_si128(r), a);
4799    }
4800
4801    #[simd_test(enable = "avx")]
4802    unsafe fn test_mm256_zextps128_ps256() {
4803        let a = _mm_setr_ps(1., 2., 3., 4.);
4804        let r = _mm256_zextps128_ps256(a);
4805        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
4806        assert_eq_m256(r, e);
4807    }
4808
4809    #[simd_test(enable = "avx")]
4810    unsafe fn test_mm256_zextsi128_si256() {
4811        let a = _mm_setr_epi64x(1, 2);
4812        let r = _mm256_zextsi128_si256(a);
4813        let e = _mm256_setr_epi64x(1, 2, 0, 0);
4814        assert_eq_m256i(r, e);
4815    }
4816
4817    #[simd_test(enable = "avx")]
4818    unsafe fn test_mm256_zextpd128_pd256() {
4819        let a = _mm_setr_pd(1., 2.);
4820        let r = _mm256_zextpd128_pd256(a);
4821        let e = _mm256_setr_pd(1., 2., 0., 0.);
4822        assert_eq_m256d(r, e);
4823    }
4824
4825    #[simd_test(enable = "avx")]
4826    unsafe fn test_mm256_set_m128() {
4827        let hi = _mm_setr_ps(5., 6., 7., 8.);
4828        let lo = _mm_setr_ps(1., 2., 3., 4.);
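        // `lo` fills bits 127:0 and `hi` fills bits 255:128, so the result in
        // `setr` (low-to-high) order is lo followed by hi.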
4829        let r = _mm256_set_m128(hi, lo);
4830        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4831        assert_eq_m256(r, e);
4832    }
4833
4834    #[simd_test(enable = "avx")]
4835    unsafe fn test_mm256_set_m128d() {
4836        let hi = _mm_setr_pd(3., 4.);
4837        let lo = _mm_setr_pd(1., 2.);
4838        let r = _mm256_set_m128d(hi, lo);
4839        let e = _mm256_setr_pd(1., 2., 3., 4.);
4840        assert_eq_m256d(r, e);
4841    }
4842
4843    #[simd_test(enable = "avx")]
4844    unsafe fn test_mm256_set_m128i() {
4845        #[rustfmt::skip]
4846        let hi = _mm_setr_epi8(
4847            17, 18, 19, 20,
4848            21, 22, 23, 24,
4849            25, 26, 27, 28,
4850            29, 30, 31, 32,
4851        );
4852        #[rustfmt::skip]
4853        let lo = _mm_setr_epi8(
4854            1, 2, 3, 4,
4855            5, 6, 7, 8,
4856            9, 10, 11, 12,
4857            13, 14, 15, 16,
4858        );
4859        let r = _mm256_set_m128i(hi, lo);
4860        #[rustfmt::skip]
4861        let e = _mm256_setr_epi8(
4862            1, 2, 3, 4, 5, 6, 7, 8,
4863            9, 10, 11, 12, 13, 14, 15, 16,
4864            17, 18, 19, 20, 21, 22, 23, 24,
4865            25, 26, 27, 28, 29, 30, 31, 32,
4866        );
4867        assert_eq_m256i(r, e);
4868    }
4869
4870    #[simd_test(enable = "avx")]
4871    unsafe fn test_mm256_setr_m128() {
4872        let lo = _mm_setr_ps(1., 2., 3., 4.);
4873        let hi = _mm_setr_ps(5., 6., 7., 8.);
4874        let r = _mm256_setr_m128(lo, hi);
4875        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4876        assert_eq_m256(r, e);
4877    }
4878
4879    #[simd_test(enable = "avx")]
4880    unsafe fn test_mm256_setr_m128d() {
4881        let lo = _mm_setr_pd(1., 2.);
4882        let hi = _mm_setr_pd(3., 4.);
4883        let r = _mm256_setr_m128d(lo, hi);
4884        let e = _mm256_setr_pd(1., 2., 3., 4.);
4885        assert_eq_m256d(r, e);
4886    }
4887
4888    #[simd_test(enable = "avx")]
4889    unsafe fn test_mm256_setr_m128i() {
4890        #[rustfmt::skip]
4891        let lo = _mm_setr_epi8(
4892            1, 2, 3, 4,
4893            5, 6, 7, 8,
4894            9, 10, 11, 12,
4895            13, 14, 15, 16,
4896        );
4897        #[rustfmt::skip]
4898        let hi = _mm_setr_epi8(
4899            17, 18, 19, 20, 21, 22, 23, 24,
4900            25, 26, 27, 28, 29, 30, 31, 32,
4901        );
4902        let r = _mm256_setr_m128i(lo, hi);
4903        #[rustfmt::skip]
4904        let e = _mm256_setr_epi8(
4905            1, 2, 3, 4, 5, 6, 7, 8,
4906            9, 10, 11, 12, 13, 14, 15, 16,
4907            17, 18, 19, 20, 21, 22, 23, 24,
4908            25, 26, 27, 28, 29, 30, 31, 32,
4909        );
4910        assert_eq_m256i(r, e);
4911    }
4912
4913    #[simd_test(enable = "avx")]
4914    unsafe fn test_mm256_loadu2_m128() {
4915        let hi = &[5., 6., 7., 8.];
4916        let hiaddr = hi.as_ptr();
4917        let lo = &[1., 2., 3., 4.];
4918        let loaddr = lo.as_ptr();
4919        let r = _mm256_loadu2_m128(hiaddr, loaddr);
4920        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4921        assert_eq_m256(r, e);
4922    }
4923
4924    #[simd_test(enable = "avx")]
4925    unsafe fn test_mm256_loadu2_m128d() {
4926        let hi = &[3., 4.];
4927        let hiaddr = hi.as_ptr();
4928        let lo = &[1., 2.];
4929        let loaddr = lo.as_ptr();
4930        let r = _mm256_loadu2_m128d(hiaddr, loaddr);
4931        let e = _mm256_setr_pd(1., 2., 3., 4.);
4932        assert_eq_m256d(r, e);
4933    }
4934
4935    #[simd_test(enable = "avx")]
4936    unsafe fn test_mm256_loadu2_m128i() {
4937        #[rustfmt::skip]
4938        let hi = _mm_setr_epi8(
4939            17, 18, 19, 20, 21, 22, 23, 24,
4940            25, 26, 27, 28, 29, 30, 31, 32,
4941        );
4942        #[rustfmt::skip]
4943        let lo = _mm_setr_epi8(
4944            1, 2, 3, 4, 5, 6, 7, 8,
4945            9, 10, 11, 12, 13, 14, 15, 16,
4946        );
4947        let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
4948        #[rustfmt::skip]
4949        let e = _mm256_setr_epi8(
4950            1, 2, 3, 4, 5, 6, 7, 8,
4951            9, 10, 11, 12, 13, 14, 15, 16,
4952            17, 18, 19, 20, 21, 22, 23, 24,
4953            25, 26, 27, 28, 29, 30, 31, 32,
4954        );
4955        assert_eq_m256i(r, e);
4956    }
4957
4958    #[simd_test(enable = "avx")]
4959    unsafe fn test_mm256_storeu2_m128() {
4960        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4961        let mut hi = _mm_undefined_ps();
4962        let mut lo = _mm_undefined_ps();
4963        _mm256_storeu2_m128(
4964            ptr::addr_of_mut!(hi) as *mut f32,
4965            ptr::addr_of_mut!(lo) as *mut f32,
4966            a,
4967        );
4968        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
4969        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
4970    }
4971
4972    #[simd_test(enable = "avx")]
4973    unsafe fn test_mm256_storeu2_m128d() {
4974        let a = _mm256_setr_pd(1., 2., 3., 4.);
4975        let mut hi = _mm_undefined_pd();
4976        let mut lo = _mm_undefined_pd();
4977        _mm256_storeu2_m128d(
4978            ptr::addr_of_mut!(hi) as *mut f64,
4979            ptr::addr_of_mut!(lo) as *mut f64,
4980            a,
4981        );
4982        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
4983        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
4984    }
4985
4986    #[simd_test(enable = "avx")]
4987    unsafe fn test_mm256_storeu2_m128i() {
4988        #[rustfmt::skip]
4989        let a = _mm256_setr_epi8(
4990            1, 2, 3, 4, 5, 6, 7, 8,
4991            9, 10, 11, 12, 13, 14, 15, 16,
4992            17, 18, 19, 20, 21, 22, 23, 24,
4993            25, 26, 27, 28, 29, 30, 31, 32,
4994        );
4995        let mut hi = _mm_undefined_si128();
4996        let mut lo = _mm_undefined_si128();
4997        _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
4998        #[rustfmt::skip]
4999        let e_hi = _mm_setr_epi8(
5000            17, 18, 19, 20, 21, 22, 23, 24,
5001            25, 26, 27, 28, 29, 30, 31, 32
5002        );
5003        #[rustfmt::skip]
5004        let e_lo = _mm_setr_epi8(
5005            1, 2, 3, 4, 5, 6, 7, 8,
5006            9, 10, 11, 12, 13, 14, 15, 16
5007        );
5008
5009        assert_eq_m128i(hi, e_hi);
5010        assert_eq_m128i(lo, e_lo);
5011    }
5012
5013    #[simd_test(enable = "avx")]
5014    unsafe fn test_mm256_cvtss_f32() {
5015        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5016        let r = _mm256_cvtss_f32(a);
5017        assert_eq!(r, 1.);
5018    }
5019}