core/stdarch/crates/core_arch/src/x86/avx.rs

//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    mem, ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
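///
/// # Examples
///
/// A minimal usage sketch, not part of the original documentation: it assumes
/// an x86-64 target with `std` available and verifies AVX support at runtime
/// before calling the intrinsic.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
///         let b = _mm256_set1_pd(10.0);
///         let mut out = [0.0_f64; 4];
///         _mm256_storeu_pd(out.as_mut_ptr(), _mm256_add_pd(a, b));
///         assert_eq!(out, [11.0, 12.0, 13.0, 14.0]);
///     }
/// }
/// ```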
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_add(a, b) }
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_add(a, b) }
}

/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
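///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions for illustration). Per 128-bit lane,
/// bits 0 and 2 of `MASK` pick an element of `a`, bits 1 and 3 pick an
/// element of `b`.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
///         let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
///         // MASK = 0b0101 -> [a[1], b[0], a[3], b[2]]
///         let r = _mm256_shuffle_pd::<0b0101>(a, b);
///         let mut out = [0.0_f64; 4];
///         _mm256_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [2.0, 10.0, 4.0, 30.0]);
///     }
/// }
/// ```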
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}

/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
    }
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`
/// and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
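///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions). ANDNOT with the sign-bit mask
/// `-0.0` clears the sign bit, yielding the absolute value.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let sign_bit = _mm256_set1_ps(-0.0);
///         let x = _mm256_setr_ps(-1.5, 2.0, -0.0, 8.25, -3.0, 4.0, -5.5, 6.0);
///         let r = _mm256_andnot_ps(sign_bit, x);
///         let mut out = [0.0_f32; 8];
///         _mm256_storeu_ps(out.as_mut_ptr(), r);
///         assert_eq!(out, [1.5, 2.0, 0.0, 8.25, 3.0, 4.0, 5.5, 6.0]);
///     }
/// }
/// ```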
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
    }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vmaxpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vmaxps(a, b) }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vminpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vminps(a, b) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_mul(a, b) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_mul(a, b) }
}

/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
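///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions). Even-indexed elements are
/// subtracted, odd-indexed elements are added.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
///         let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
///         let r = _mm256_addsub_pd(a, b);
///         let mut out = [0.0_f64; 4];
///         _mm256_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [-9.0, 22.0, -27.0, 44.0]);
///     }
/// }
/// ```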
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a = a.as_f64x4();
        let b = b.as_f64x4();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}

/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a = a.as_f32x8();
        let b = b.as_f32x8();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_sub(a, b) }
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_sub(a, b) }
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_div(a, b) }
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_div(a, b) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
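///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions), using `0x03` to truncate toward
/// zero.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_pd(1.4, 2.5, -3.5, -4.6);
///         let r = _mm256_round_pd::<0x03>(a);
///         let mut out = [0.0_f64; 4];
///         _mm256_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [1.0, 2.0, -3.0, -4.0]);
///     }
/// }
/// ```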
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd256(a, ROUNDING) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    unsafe { simd_ceil(a) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_pd(a: __m256d) -> __m256d {
    unsafe { simd_floor(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps256(a, ROUNDING) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_ps(a: __m256) -> __m256 {
    unsafe { simd_ceil(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_ps(a: __m256) -> __m256 {
    unsafe { simd_floor(a) }
}

/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    unsafe { simd_fsqrt(a) }
}

/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    unsafe { simd_fsqrt(a) }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
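///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions). Bit `i` of the immediate selects
/// element `i` from `b` when set and from `a` when clear.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
///         let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
///         let r = _mm256_blend_pd::<0b0101>(a, b);
///         let mut out = [0.0_f64; 4];
///         _mm256_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [10.0, 2.0, 30.0, 4.0]);
///     }
/// }
/// ```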
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe {
        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe {
        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
    }
}

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`, sums the four
/// products, and conditionally returns the sum using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
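///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions). With `IMM8 = 0xFF`, all four
/// products within each 128-bit lane are summed and the lane's sum is
/// broadcast to every element of that lane.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
///         let b = _mm256_set1_ps(1.0);
///         let r = _mm256_dp_ps::<0xFF>(a, b);
///         let mut out = [0.0_f32; 8];
///         _mm256_storeu_ps(out.as_mut_ptr(), r);
///         assert_eq!(out, [10.0, 10.0, 10.0, 10.0, 26.0, 26.0, 26.0, 26.0]);
///     }
/// }
/// ```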
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vdpps(a, b, IMM8 as i8) }
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
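///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions): the result is
/// `[a0 + a1, b0 + b1, a2 + a3, b2 + b3]`.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
///         let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
///         let r = _mm256_hadd_pd(a, b);
///         let mut out = [0.0_f64; 4];
///         _mm256_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [3.0, 30.0, 7.0, 70.0]);
///     }
/// }
/// ```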
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vhaddpd(a, b) }
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are returned in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vhaddps(a, b) }
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vhsubpd(a, b) }
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// of indices 0, 1, 4, 5; while differences of elements from `b` are returned
/// in locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vhsubps(a, b) }
}

/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
}

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
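///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions). Each result element is all-ones
/// where the predicate holds and all-zeros otherwise; the sign bits are
/// collected here with `_mm256_movemask_pd`.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_pd(1.0, 5.0, 3.0, 7.0);
///         let b = _mm256_setr_pd(2.0, 4.0, 6.0, 8.0);
///         let lt = _mm256_cmp_pd::<{ _CMP_LT_OQ }>(a, b);
///         assert_eq!(_mm256_movemask_pd(lt), 0b1101);
///     }
/// }
/// ```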
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmppd256(a, b, IMM5 as u8) }
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
}

/// Compares the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper element from `a` to the upper element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpsd(a, b, IMM5 as i8) }
}

/// Compares the lower single-precision (32-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper 3 packed elements from `a` to the upper elements of
/// the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpss(a, b, IMM5 as i8) }
}

/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    unsafe { simd_cast(a.as_i32x4()) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    unsafe { simd_cast(a.as_i32x8()) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    unsafe { simd_cast(a) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvtps2dq(a)) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    unsafe { simd_cast(a) }
}

/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    unsafe { simd_extract!(a, 0) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvttpd2dq(a)) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvtpd2dq(a)) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvttps2dq(a)) }
}

/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
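///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions): `IMM1 = 1` selects the upper
/// 128-bit half.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
///         let hi = _mm256_extractf128_ps::<1>(a);
///         let mut out = [0.0_f32; 4];
///         _mm_storeu_ps(out.as_mut_ptr(), hi);
///         assert_eq!(out, [4.0, 5.0, 6.0, 7.0]);
///     }
/// }
/// ```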
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
        )
    }
}

/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
}

/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
        transmute(dst)
    }
}

/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
}

/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    unsafe { simd_extract!(a.as_i32x8(), 0) }
}

/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    unsafe { vzeroall() }
}

/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128 bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    unsafe { vzeroupper() }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    unsafe { vpermilps256(a, b.as_i32x8()) }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    unsafe { vpermilps(a, b.as_i32x4()) }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
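///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions): `0x1B` reverses the four elements
/// of each 128-bit lane.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
///         let r = _mm256_permute_ps::<0x1B>(a);
///         let mut out = [0.0_f32; 8];
///         _mm256_storeu_ps(out.as_mut_ptr(), r);
///         assert_eq!(out, [3.0, 2.0, 1.0, 0.0, 7.0, 6.0, 5.0, 4.0]);
///     }
/// }
/// ```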
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                ((IMM8 as u32 >> 0) & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    unsafe { vpermilpd256(a, b.as_i64x4()) }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    unsafe { vpermilpd(a, b.as_i64x2()) }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                ((IMM4 as u32 >> 0) & 1),
                ((IMM4 as u32 >> 1) & 1),
                ((IMM4 as u32 >> 2) & 1) + 2,
                ((IMM4 as u32 >> 3) & 1) + 2,
            ],
        )
    }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_pd(),
            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
        )
    }
}

/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vperm2f128ps256(a, b, IMM8 as i8) }
}

/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vperm2f128pd256(a, b, IMM8 as i8) }
}

/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) }
}

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    _mm256_set1_ps(*f)
}

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 {
    _mm_set1_ps(*f)
}

/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    _mm256_set1_pd(*f)
}

/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])
}

/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1])
}

/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
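///
/// # Examples
///
/// A hedged sketch, not part of the original docs (x86-64 with `std` and
/// runtime AVX detection are assumptions): `IMM1 = 1` replaces the upper
/// 128-bit half of `a` with `b`.
///
/// ```
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("avx") {
///     // SAFETY: `avx` support was verified just above.
///     unsafe {
///         let a = _mm256_set1_ps(1.0);
///         let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
///         let r = _mm256_insertf128_ps::<1>(a, b);
///         let mut out = [0.0_f32; 8];
///         _mm256_storeu_ps(out.as_mut_ptr(), r);
///         assert_eq!(out, [1.0, 1.0, 1.0, 1.0, 10.0, 20.0, 30.0, 40.0]);
///     }
/// }
/// ```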
1320#[inline]
1321#[target_feature(enable = "avx")]
1322#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1323#[rustc_legacy_const_generics(2)]
1324#[stable(feature = "simd_x86", since = "1.27.0")]
1325pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
1326    static_assert_uimm_bits!(IMM1, 1);
1327    unsafe {
1328        simd_shuffle!(
1329            a,
1330            _mm256_castps128_ps256(b),
1331            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
1332        )
1333    }
1334}
1335
1336/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
1337/// double-precision (64-bit) floating-point elements) from `b` into result
1338/// at the location specified by `imm8`.
1339///
1340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
1341#[inline]
1342#[target_feature(enable = "avx")]
1343#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1344#[rustc_legacy_const_generics(2)]
1345#[stable(feature = "simd_x86", since = "1.27.0")]
1346pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
1347    static_assert_uimm_bits!(IMM1, 1);
1348    unsafe {
1349        simd_shuffle!(
1350            a,
1351            _mm256_castpd128_pd256(b),
1352            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1353        )
1354    }
1355}
1356
1357/// Copies `a` to result, then inserts 128 bits from `b` into result
1358/// at the location specified by `imm8`.
1359///
1360/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
1361#[inline]
1362#[target_feature(enable = "avx")]
1363#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1364#[rustc_legacy_const_generics(2)]
1365#[stable(feature = "simd_x86", since = "1.27.0")]
1366pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1367    static_assert_uimm_bits!(IMM1, 1);
1368    unsafe {
1369        let dst: i64x4 = simd_shuffle!(
1370            a.as_i64x4(),
1371            _mm256_castsi128_si256(b).as_i64x4(),
1372            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1373        );
1374        transmute(dst)
1375    }
1376}
1377
1378/// Copies `a` to result, and inserts the 8-bit integer `i` into result
1379/// at the location specified by `index`.
1380///
1381/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
1382#[inline]
1383#[target_feature(enable = "avx")]
1384// This intrinsic has no corresponding instruction.
1385#[rustc_legacy_const_generics(2)]
1386#[stable(feature = "simd_x86", since = "1.27.0")]
1387pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
1388    static_assert_uimm_bits!(INDEX, 5);
1389    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
1390}
1391
1392/// Copies `a` to result, and inserts the 16-bit integer `i` into result
1393/// at the location specified by `index`.
1394///
1395/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
1396#[inline]
1397#[target_feature(enable = "avx")]
1398// This intrinsic has no corresponding instruction.
1399#[rustc_legacy_const_generics(2)]
1400#[stable(feature = "simd_x86", since = "1.27.0")]
1401pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
1402    static_assert_uimm_bits!(INDEX, 4);
1403    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
1404}
1405
1406/// Copies `a` to result, and inserts the 32-bit integer `i` into result
1407/// at the location specified by `index`.
1408///
1409/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
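///
/// # Examples
///
/// A minimal sketch (illustrative only), assuming AVX has been detected at
/// runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///             // Overwrite the element at index 7 with 42.
///             let v = _mm256_insert_epi32::<7>(a, 42);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, v);
///             assert_eq!(out, [0, 1, 2, 3, 4, 5, 6, 42]);
///         }
///     }
/// }
/// ```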
1410#[inline]
1411#[target_feature(enable = "avx")]
1412// This intrinsic has no corresponding instruction.
1413#[rustc_legacy_const_generics(2)]
1414#[stable(feature = "simd_x86", since = "1.27.0")]
1415pub fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
1416    static_assert_uimm_bits!(INDEX, 3);
1417    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
1418}
1419
1420/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1421/// floating-point elements) from memory into result.
1422/// `mem_addr` must be aligned on a 32-byte boundary or a
1423/// general-protection exception may be generated.
1424///
1425/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
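///
/// # Examples
///
/// A minimal sketch (illustrative only; the `Aligned` wrapper below is just
/// one way to obtain a 32-byte-aligned buffer), assuming AVX has been
/// detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             #[repr(align(32))]
///             struct Aligned([f64; 4]);
///             let data = Aligned([1.0, 2.0, 3.0, 4.0]);
///             // The buffer is 32-byte aligned, so the aligned load is valid.
///             let v = _mm256_load_pd(data.0.as_ptr());
///             let mut out = Aligned([0.0; 4]);
///             _mm256_store_pd(out.0.as_mut_ptr(), _mm256_add_pd(v, v));
///             assert_eq!(out.0, [2.0, 4.0, 6.0, 8.0]);
///         }
///     }
/// }
/// ```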
1426#[inline]
1427#[target_feature(enable = "avx")]
1428#[cfg_attr(
1429    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1430    assert_instr(vmovap)
1431)]
1432#[stable(feature = "simd_x86", since = "1.27.0")]
1433#[allow(clippy::cast_ptr_alignment)]
1434pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1435    *(mem_addr as *const __m256d)
1436}
1437
1438/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1439/// floating-point elements) from `a` into memory.
1440/// `mem_addr` must be aligned on a 32-byte boundary or a
1441/// general-protection exception may be generated.
1442///
1443/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
1444#[inline]
1445#[target_feature(enable = "avx")]
1446#[cfg_attr(
1447    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1448    assert_instr(vmovap)
1449)]
1450#[stable(feature = "simd_x86", since = "1.27.0")]
1451#[allow(clippy::cast_ptr_alignment)]
1452pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
1453    *(mem_addr as *mut __m256d) = a;
1454}
1455
1456/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1457/// floating-point elements) from memory into result.
1458/// `mem_addr` must be aligned on a 32-byte boundary or a
1459/// general-protection exception may be generated.
1460///
1461/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
1462#[inline]
1463#[target_feature(enable = "avx")]
1464#[cfg_attr(
1465    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1466    assert_instr(vmovaps)
1467)]
1468#[stable(feature = "simd_x86", since = "1.27.0")]
1469#[allow(clippy::cast_ptr_alignment)]
1470pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1471    *(mem_addr as *const __m256)
1472}
1473
1474/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1475/// floating-point elements) from `a` into memory.
1476/// `mem_addr` must be aligned on a 32-byte boundary or a
1477/// general-protection exception may be generated.
1478///
1479/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
1480#[inline]
1481#[target_feature(enable = "avx")]
1482#[cfg_attr(
1483    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1484    assert_instr(vmovaps)
1485)]
1486#[stable(feature = "simd_x86", since = "1.27.0")]
1487#[allow(clippy::cast_ptr_alignment)]
1488pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
1489    *(mem_addr as *mut __m256) = a;
1490}
1491
1492/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1493/// floating-point elements) from memory into result.
1494/// `mem_addr` does not need to be aligned on any particular boundary.
1495///
1496/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
1497#[inline]
1498#[target_feature(enable = "avx")]
1499#[cfg_attr(test, assert_instr(vmovup))]
1500#[stable(feature = "simd_x86", since = "1.27.0")]
1501pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1502    let mut dst = _mm256_undefined_pd();
1503    ptr::copy_nonoverlapping(
1504        mem_addr as *const u8,
1505        ptr::addr_of_mut!(dst) as *mut u8,
1506        mem::size_of::<__m256d>(),
1507    );
1508    dst
1509}
1510
1511/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1512/// floating-point elements) from `a` into memory.
1513/// `mem_addr` does not need to be aligned on any particular boundary.
1514///
1515/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
1516#[inline]
1517#[target_feature(enable = "avx")]
1518#[cfg_attr(test, assert_instr(vmovup))]
1519#[stable(feature = "simd_x86", since = "1.27.0")]
1520pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
1521    mem_addr.cast::<__m256d>().write_unaligned(a);
1522}
1523
1524/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1525/// floating-point elements) from memory into result.
1526/// `mem_addr` does not need to be aligned on any particular boundary.
1527///
1528/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
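///
/// # Examples
///
/// A minimal sketch (illustrative only), assuming AVX has been detected at
/// runtime; no particular alignment is required for the unaligned load/store
/// pair:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             let data: [f32; 8] = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
///             let v = _mm256_loadu_ps(data.as_ptr());
///             let mut out = [0.0f32; 8];
///             _mm256_storeu_ps(out.as_mut_ptr(), _mm256_add_ps(v, v));
///             assert_eq!(out, [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]);
///         }
///     }
/// }
/// ```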
1529#[inline]
1530#[target_feature(enable = "avx")]
1531#[cfg_attr(test, assert_instr(vmovups))]
1532#[stable(feature = "simd_x86", since = "1.27.0")]
1533pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1534    let mut dst = _mm256_undefined_ps();
1535    ptr::copy_nonoverlapping(
1536        mem_addr as *const u8,
1537        ptr::addr_of_mut!(dst) as *mut u8,
1538        mem::size_of::<__m256>(),
1539    );
1540    dst
1541}
1542
1543/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1544/// floating-point elements) from `a` into memory.
1545/// `mem_addr` does not need to be aligned on any particular boundary.
1546///
1547/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
1548#[inline]
1549#[target_feature(enable = "avx")]
1550#[cfg_attr(test, assert_instr(vmovups))]
1551#[stable(feature = "simd_x86", since = "1.27.0")]
1552pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
1553    mem_addr.cast::<__m256>().write_unaligned(a);
1554}
1555
1556/// Loads 256-bits of integer data from memory into result.
1557/// `mem_addr` must be aligned on a 32-byte boundary or a
1558/// general-protection exception may be generated.
1559///
1560/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
1561#[inline]
1562#[target_feature(enable = "avx")]
1563#[cfg_attr(
1564    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1565    assert_instr(vmovaps)
1566)] // FIXME vmovdqa expected
1567#[stable(feature = "simd_x86", since = "1.27.0")]
1568pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
1569    *mem_addr
1570}
1571
1572/// Stores 256-bits of integer data from `a` into memory.
1573/// `mem_addr` must be aligned on a 32-byte boundary or a
1574/// general-protection exception may be generated.
1575///
1576/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
1577#[inline]
1578#[target_feature(enable = "avx")]
1579#[cfg_attr(
1580    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1581    assert_instr(vmovaps)
1582)] // FIXME vmovdqa expected
1583#[stable(feature = "simd_x86", since = "1.27.0")]
1584pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
1585    *mem_addr = a;
1586}
1587
1588/// Loads 256-bits of integer data from memory into result.
1589/// `mem_addr` does not need to be aligned on any particular boundary.
1590///
1591/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
1592#[inline]
1593#[target_feature(enable = "avx")]
1594#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1595#[stable(feature = "simd_x86", since = "1.27.0")]
1596pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1597    let mut dst = _mm256_undefined_si256();
1598    ptr::copy_nonoverlapping(
1599        mem_addr as *const u8,
1600        ptr::addr_of_mut!(dst) as *mut u8,
1601        mem::size_of::<__m256i>(),
1602    );
1603    dst
1604}
1605
1606/// Stores 256-bits of integer data from `a` into memory.
1607/// `mem_addr` does not need to be aligned on any particular boundary.
1608///
1609/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
1610#[inline]
1611#[target_feature(enable = "avx")]
1612#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1613#[stable(feature = "simd_x86", since = "1.27.0")]
1614pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
1615    mem_addr.write_unaligned(a);
1616}
1617
1618/// Loads packed double-precision (64-bit) floating-point elements from memory
1619/// into result using `mask` (elements are zeroed out when the high bit of the
1620/// corresponding element is not set).
1621///
1622/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
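///
/// # Examples
///
/// A minimal sketch (illustrative only), assuming AVX has been detected at
/// runtime; only the elements whose mask lane has its highest bit set are
/// read from memory:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             let data: [f64; 4] = [1.0, 2.0, 3.0, 4.0];
///             // Load elements 0 and 2; elements 1 and 3 are zeroed.
///             let mask = _mm256_setr_epi64x(i64::MIN, 0, i64::MIN, 0);
///             let v = _mm256_maskload_pd(data.as_ptr(), mask);
///             let mut out = [0.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             assert_eq!(out, [1.0, 0.0, 3.0, 0.0]);
///         }
///     }
/// }
/// ```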
1623#[inline]
1624#[target_feature(enable = "avx")]
1625#[cfg_attr(test, assert_instr(vmaskmovpd))]
1626#[stable(feature = "simd_x86", since = "1.27.0")]
1627pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
1628    maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
1629}
1630
1631/// Stores packed double-precision (64-bit) floating-point elements from `a`
1632/// into memory using `mask`.
1633///
1634/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
1635#[inline]
1636#[target_feature(enable = "avx")]
1637#[cfg_attr(test, assert_instr(vmaskmovpd))]
1638#[stable(feature = "simd_x86", since = "1.27.0")]
1639pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
1640    maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
1641}
1642
1643/// Loads packed double-precision (64-bit) floating-point elements from memory
1644/// into result using `mask` (elements are zeroed out when the high bit of the
1645/// corresponding element is not set).
1646///
1647/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
1648#[inline]
1649#[target_feature(enable = "avx")]
1650#[cfg_attr(test, assert_instr(vmaskmovpd))]
1651#[stable(feature = "simd_x86", since = "1.27.0")]
1652pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1653    maskloadpd(mem_addr as *const i8, mask.as_i64x2())
1654}
1655
1656/// Stores packed double-precision (64-bit) floating-point elements from `a`
1657/// into memory using `mask`.
1658///
1659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
1660#[inline]
1661#[target_feature(enable = "avx")]
1662#[cfg_attr(test, assert_instr(vmaskmovpd))]
1663#[stable(feature = "simd_x86", since = "1.27.0")]
1664pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1665    maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a);
1666}
1667
1668/// Loads packed single-precision (32-bit) floating-point elements from memory
1669/// into result using `mask` (elements are zeroed out when the high bit of the
1670/// corresponding element is not set).
1671///
1672/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
1673#[inline]
1674#[target_feature(enable = "avx")]
1675#[cfg_attr(test, assert_instr(vmaskmovps))]
1676#[stable(feature = "simd_x86", since = "1.27.0")]
1677pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
1678    maskloadps256(mem_addr as *const i8, mask.as_i32x8())
1679}
1680
1681/// Stores packed single-precision (32-bit) floating-point elements from `a`
1682/// into memory using `mask`.
1683///
1684/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
1685#[inline]
1686#[target_feature(enable = "avx")]
1687#[cfg_attr(test, assert_instr(vmaskmovps))]
1688#[stable(feature = "simd_x86", since = "1.27.0")]
1689pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
1690    maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
1691}
1692
1693/// Loads packed single-precision (32-bit) floating-point elements from memory
1694/// into result using `mask` (elements are zeroed out when the high bit of the
1695/// corresponding element is not set).
1696///
1697/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
1698#[inline]
1699#[target_feature(enable = "avx")]
1700#[cfg_attr(test, assert_instr(vmaskmovps))]
1701#[stable(feature = "simd_x86", since = "1.27.0")]
1702pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1703    maskloadps(mem_addr as *const i8, mask.as_i32x4())
1704}
1705
1706/// Stores packed single-precision (32-bit) floating-point elements from `a`
1707/// into memory using `mask`.
1708///
1709/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
1710#[inline]
1711#[target_feature(enable = "avx")]
1712#[cfg_attr(test, assert_instr(vmaskmovps))]
1713#[stable(feature = "simd_x86", since = "1.27.0")]
1714pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1715    maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a);
1716}
1717
1718/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
1719/// from `a`, and returns the results.
1720///
1721/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
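///
/// # Examples
///
/// A minimal sketch (illustrative only), assuming AVX has been detected at
/// runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
///             // Each odd-indexed element is copied into the even slot below it.
///             let v = _mm256_movehdup_ps(a);
///             let mut out = [0.0f32; 8];
///             _mm256_storeu_ps(out.as_mut_ptr(), v);
///             assert_eq!(out, [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0]);
///         }
///     }
/// }
/// ```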
1722#[inline]
1723#[target_feature(enable = "avx")]
1724#[cfg_attr(test, assert_instr(vmovshdup))]
1725#[stable(feature = "simd_x86", since = "1.27.0")]
1726pub fn _mm256_movehdup_ps(a: __m256) -> __m256 {
1727    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
1728}
1729
1730/// Duplicates even-indexed single-precision (32-bit) floating-point elements
1731/// from `a`, and returns the results.
1732///
1733/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
1734#[inline]
1735#[target_feature(enable = "avx")]
1736#[cfg_attr(test, assert_instr(vmovsldup))]
1737#[stable(feature = "simd_x86", since = "1.27.0")]
1738pub fn _mm256_moveldup_ps(a: __m256) -> __m256 {
1739    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
1740}
1741
1742/// Duplicates even-indexed double-precision (64-bit) floating-point elements
1743/// from `a`, and returns the results.
1744///
1745/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
1746#[inline]
1747#[target_feature(enable = "avx")]
1748#[cfg_attr(test, assert_instr(vmovddup))]
1749#[stable(feature = "simd_x86", since = "1.27.0")]
1750pub fn _mm256_movedup_pd(a: __m256d) -> __m256d {
1751    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
1752}
1753
1754/// Loads 256-bits of integer data from unaligned memory into result.
1755/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1756/// data crosses a cache line boundary.
1757///
1758/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
1759#[inline]
1760#[target_feature(enable = "avx")]
1761#[cfg_attr(test, assert_instr(vlddqu))]
1762#[stable(feature = "simd_x86", since = "1.27.0")]
1763pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
1764    transmute(vlddqu(mem_addr as *const i8))
1765}
1766
1767/// Moves integer data from a 256-bit integer vector to a 32-byte
1768/// aligned memory location. To minimize caching, the data is flagged as
1769/// non-temporal (unlikely to be used again soon).
1770///
1771/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
1772///
1773/// # Safety of non-temporal stores
1774///
1775/// After using this intrinsic, but before any other access to the memory that this intrinsic
1776/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1777/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1778/// return.
1779///
1780/// See [`_mm_sfence`] for details.
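///
/// # Examples
///
/// A minimal sketch (illustrative only; the `Aligned` wrapper below is just
/// one way to obtain a 32-byte-aligned buffer), assuming AVX has been
/// detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             #[repr(align(32))]
///             struct Aligned([i32; 8]);
///             let mut out = Aligned([0; 8]);
///             _mm256_stream_si256(out.0.as_mut_ptr() as *mut __m256i, _mm256_set1_epi32(7));
///             // Fence before the stored memory is accessed again.
///             _mm_sfence();
///             assert_eq!(out.0, [7; 8]);
///         }
///     }
/// }
/// ```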
1781#[inline]
1782#[target_feature(enable = "avx")]
1783#[cfg_attr(test, assert_instr(vmovntdq))]
1784#[stable(feature = "simd_x86", since = "1.27.0")]
1785pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1786    crate::arch::asm!(
1787        vps!("vmovntdq", ",{a}"),
1788        p = in(reg) mem_addr,
1789        a = in(ymm_reg) a,
1790        options(nostack, preserves_flags),
1791    );
1792}
1793
1794/// Moves double-precision values from a 256-bit vector of `[4 x double]`
1795/// to a 32-byte aligned memory location. To minimize caching, the data is
1796/// flagged as non-temporal (unlikely to be used again soon).
1797///
1798/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
1799///
1800/// # Safety of non-temporal stores
1801///
1802/// After using this intrinsic, but before any other access to the memory that this intrinsic
1803/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1804/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1805/// return.
1806///
1807/// See [`_mm_sfence`] for details.
1808#[inline]
1809#[target_feature(enable = "avx")]
1810#[cfg_attr(test, assert_instr(vmovntpd))]
1811#[stable(feature = "simd_x86", since = "1.27.0")]
1812#[allow(clippy::cast_ptr_alignment)]
1813pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1814    crate::arch::asm!(
1815        vps!("vmovntpd", ",{a}"),
1816        p = in(reg) mem_addr,
1817        a = in(ymm_reg) a,
1818        options(nostack, preserves_flags),
1819    );
1820}
1821
1822/// Moves single-precision floating point values from a 256-bit vector
1823/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
1824/// caching, the data is flagged as non-temporal (unlikely to be used again
1825/// soon).
1826///
1827/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
1828///
1829/// # Safety of non-temporal stores
1830///
1831/// After using this intrinsic, but before any other access to the memory that this intrinsic
1832/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1833/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1834/// return.
1835///
1836/// See [`_mm_sfence`] for details.
1837#[inline]
1838#[target_feature(enable = "avx")]
1839#[cfg_attr(test, assert_instr(vmovntps))]
1840#[stable(feature = "simd_x86", since = "1.27.0")]
1841#[allow(clippy::cast_ptr_alignment)]
1842pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1843    crate::arch::asm!(
1844        vps!("vmovntps", ",{a}"),
1845        p = in(reg) mem_addr,
1846        a = in(ymm_reg) a,
1847        options(nostack, preserves_flags),
1848    );
1849}
1850
1851/// Computes the approximate reciprocal of packed single-precision (32-bit)
1852/// floating-point elements in `a`, and returns the results. The maximum
1853/// relative error for this approximation is less than 1.5*2^-12.
1854///
1855/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
1856#[inline]
1857#[target_feature(enable = "avx")]
1858#[cfg_attr(test, assert_instr(vrcpps))]
1859#[stable(feature = "simd_x86", since = "1.27.0")]
1860pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
1861    unsafe { vrcpps(a) }
1862}
1863
1864/// Computes the approximate reciprocal square root of packed single-precision
1865/// (32-bit) floating-point elements in `a`, and returns the results.
1866/// The maximum relative error for this approximation is less than 1.5*2^-12.
1867///
1868/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
1869#[inline]
1870#[target_feature(enable = "avx")]
1871#[cfg_attr(test, assert_instr(vrsqrtps))]
1872#[stable(feature = "simd_x86", since = "1.27.0")]
1873pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
1874    unsafe { vrsqrtps(a) }
1875}
1876
1877/// Unpacks and interleaves double-precision (64-bit) floating-point elements
1878/// from the high half of each 128-bit lane in `a` and `b`.
1879///
1880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
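///
/// # Examples
///
/// A minimal sketch (illustrative only), assuming AVX has been detected at
/// runtime; note that the interleaving happens within each 128-bit lane:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             let a = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
///             let b = _mm256_setr_pd(10.0, 11.0, 12.0, 13.0);
///             let v = _mm256_unpackhi_pd(a, b);
///             let mut out = [0.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             assert_eq!(out, [1.0, 11.0, 3.0, 13.0]);
///         }
///     }
/// }
/// ```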
1881#[inline]
1882#[target_feature(enable = "avx")]
1883#[cfg_attr(test, assert_instr(vunpckhpd))]
1884#[stable(feature = "simd_x86", since = "1.27.0")]
1885pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
1886    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
1887}
1888
1889/// Unpacks and interleaves single-precision (32-bit) floating-point elements
1890/// from the high half of each 128-bit lane in `a` and `b`.
1891///
1892/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
1893#[inline]
1894#[target_feature(enable = "avx")]
1895#[cfg_attr(test, assert_instr(vunpckhps))]
1896#[stable(feature = "simd_x86", since = "1.27.0")]
1897pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
1898    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
1899}
1900
1901/// Unpacks and interleaves double-precision (64-bit) floating-point elements
1902/// from the low half of each 128-bit lane in `a` and `b`.
1903///
1904/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
1905#[inline]
1906#[target_feature(enable = "avx")]
1907#[cfg_attr(test, assert_instr(vunpcklpd))]
1908#[stable(feature = "simd_x86", since = "1.27.0")]
1909pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
1910    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
1911}
1912
1913/// Unpacks and interleaves single-precision (32-bit) floating-point elements
1914/// from the low half of each 128-bit lane in `a` and `b`.
1915///
1916/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
1917#[inline]
1918#[target_feature(enable = "avx")]
1919#[cfg_attr(test, assert_instr(vunpcklps))]
1920#[stable(feature = "simd_x86", since = "1.27.0")]
1921pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
1922    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
1923}
1924
1925/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1926/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
1927/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
1928/// the result is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
1929///
1930/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
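///
/// # Examples
///
/// A minimal sketch (illustrative only), assuming AVX has been detected at
/// runtime; a common use is checking whether two bit sets are disjoint:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             let a = _mm256_setr_epi64x(1, 2, 4, 8);
///             let b = _mm256_setr_epi64x(16, 32, 64, 128);
///             // `a & b` is all zeros, so `ZF` is set and 1 is returned.
///             assert_eq!(_mm256_testz_si256(a, b), 1);
///             assert_eq!(_mm256_testz_si256(a, a), 0);
///         }
///     }
/// }
/// ```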
1931#[inline]
1932#[target_feature(enable = "avx")]
1933#[cfg_attr(test, assert_instr(vptest))]
1934#[stable(feature = "simd_x86", since = "1.27.0")]
1935pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
1936    unsafe { ptestz256(a.as_i64x4(), b.as_i64x4()) }
1937}
1938
1939/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1940/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
1941/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
1942/// the result is zero, otherwise sets `CF` to 0. Returns the `CF` value.
1943///
1944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
1945#[inline]
1946#[target_feature(enable = "avx")]
1947#[cfg_attr(test, assert_instr(vptest))]
1948#[stable(feature = "simd_x86", since = "1.27.0")]
1949pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
1950    unsafe { ptestc256(a.as_i64x4(), b.as_i64x4()) }
1951}
1952
1953/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1954/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
1955/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
1956/// the result is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
1957/// `CF` values are zero, otherwise returns 0.
1958///
1959/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
1960#[inline]
1961#[target_feature(enable = "avx")]
1962#[cfg_attr(test, assert_instr(vptest))]
1963#[stable(feature = "simd_x86", since = "1.27.0")]
1964pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
1965    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
1966}
1967
1968/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
1969/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1970/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1971/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1972/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1973/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1974/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
1975///
1976/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
1977#[inline]
1978#[target_feature(enable = "avx")]
1979#[cfg_attr(test, assert_instr(vtestpd))]
1980#[stable(feature = "simd_x86", since = "1.27.0")]
1981pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
1982    unsafe { vtestzpd256(a, b) }
1983}
1984
1985/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
1986/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1987/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1988/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1989/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1990/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1991/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
1992///
1993/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
1994#[inline]
1995#[target_feature(enable = "avx")]
1996#[cfg_attr(test, assert_instr(vtestpd))]
1997#[stable(feature = "simd_x86", since = "1.27.0")]
1998pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
1999    unsafe { vtestcpd256(a, b) }
2000}
2001
2002/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2003/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2004/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2005/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2006/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2007/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2008/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2009/// are zero, otherwise returns 0.
2010///
2011/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
2012#[inline]
2013#[target_feature(enable = "avx")]
2014#[cfg_attr(test, assert_instr(vtestpd))]
2015#[stable(feature = "simd_x86", since = "1.27.0")]
2016pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
2017    unsafe { vtestnzcpd256(a, b) }
2018}
2019
2020/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2021/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2022/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2023/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2024/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2025/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2026/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2027///
2028/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
2029#[inline]
2030#[target_feature(enable = "avx")]
2031#[cfg_attr(test, assert_instr(vtestpd))]
2032#[stable(feature = "simd_x86", since = "1.27.0")]
2033pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
2034    unsafe { vtestzpd(a, b) }
2035}
2036
2037/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2038/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2039/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2040/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2041/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2042/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2043/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2044///
2045/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
2046#[inline]
2047#[target_feature(enable = "avx")]
2048#[cfg_attr(test, assert_instr(vtestpd))]
2049#[stable(feature = "simd_x86", since = "1.27.0")]
2050pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
2051    unsafe { vtestcpd(a, b) }
2052}
2053
2054/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2055/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2056/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2057/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2058/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2059/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2060/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2061/// are zero, otherwise returns 0.
2062///
2063/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
2064#[inline]
2065#[target_feature(enable = "avx")]
2066#[cfg_attr(test, assert_instr(vtestpd))]
2067#[stable(feature = "simd_x86", since = "1.27.0")]
2068pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
2069    unsafe { vtestnzcpd(a, b) }
2070}
2071
2072/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2073/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2074/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2075/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2076/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2077/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2078/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2079///
2080/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
2081#[inline]
2082#[target_feature(enable = "avx")]
2083#[cfg_attr(test, assert_instr(vtestps))]
2084#[stable(feature = "simd_x86", since = "1.27.0")]
2085pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
2086    unsafe { vtestzps256(a, b) }
2087}
2088
2089/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2090/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2091/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2092/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2093/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2094/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2095/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2096///
2097/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
2098#[inline]
2099#[target_feature(enable = "avx")]
2100#[cfg_attr(test, assert_instr(vtestps))]
2101#[stable(feature = "simd_x86", since = "1.27.0")]
2102pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
2103    unsafe { vtestcps256(a, b) }
2104}
2105
2106/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2107/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2108/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2109/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2110/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2111/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2112/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2113/// are zero, otherwise returns 0.
2114///
2115/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
2116#[inline]
2117#[target_feature(enable = "avx")]
2118#[cfg_attr(test, assert_instr(vtestps))]
2119#[stable(feature = "simd_x86", since = "1.27.0")]
2120pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
2121    unsafe { vtestnzcps256(a, b) }
2122}
2123
2124/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2125/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2126/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2127/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2128/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2129/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2130/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2131///
2132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
2133#[inline]
2134#[target_feature(enable = "avx")]
2135#[cfg_attr(test, assert_instr(vtestps))]
2136#[stable(feature = "simd_x86", since = "1.27.0")]
2137pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2138    unsafe { vtestzps(a, b) }
2139}
2140
2141/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2142/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2143/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2144/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2145/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2146/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2147/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2148///
2149/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
2150#[inline]
2151#[target_feature(enable = "avx")]
2152#[cfg_attr(test, assert_instr(vtestps))]
2153#[stable(feature = "simd_x86", since = "1.27.0")]
2154pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2155    unsafe { vtestcps(a, b) }
2156}
2157
2158/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2159/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2160/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2161/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2162/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2163/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2164/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2165/// are zero, otherwise returns 0.
2166///
2167/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
2168#[inline]
2169#[target_feature(enable = "avx")]
2170#[cfg_attr(test, assert_instr(vtestps))]
2171#[stable(feature = "simd_x86", since = "1.27.0")]
2172pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
2173    unsafe { vtestnzcps(a, b) }
2174}
2175
2176/// Sets each bit of the returned mask based on the most significant bit of the
2177/// corresponding packed double-precision (64-bit) floating-point element in
2178/// `a`.
2179///
2180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
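///
/// # Examples
///
/// A minimal sketch (illustrative only), assuming AVX has been detected at
/// runtime:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             let a = _mm256_setr_pd(-1.0, 2.0, -3.0, 4.0);
///             // Bits 0 and 2 are set because elements 0 and 2 are negative.
///             assert_eq!(_mm256_movemask_pd(a), 0b0101);
///         }
///     }
/// }
/// ```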
2181#[inline]
2182#[target_feature(enable = "avx")]
2183#[cfg_attr(test, assert_instr(vmovmskpd))]
2184#[stable(feature = "simd_x86", since = "1.27.0")]
2185pub fn _mm256_movemask_pd(a: __m256d) -> i32 {
2186    // Propagate the highest bit to the rest, because simd_bitmask
2187    // requires all-1 or all-0.
2188    unsafe {
2189        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
2190        simd_bitmask::<i64x4, u8>(mask).into()
2191    }
2192}
2193
2194/// Sets each bit of the returned mask based on the most significant bit of the
2195/// corresponding packed single-precision (32-bit) floating-point element in
2196/// `a`.
2197///
2198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
2199#[inline]
2200#[target_feature(enable = "avx")]
2201#[cfg_attr(test, assert_instr(vmovmskps))]
2202#[stable(feature = "simd_x86", since = "1.27.0")]
2203pub fn _mm256_movemask_ps(a: __m256) -> i32 {
2204    // Propagate the highest bit to the rest, because simd_bitmask
2205    // requires all-1 or all-0.
2206    unsafe {
2207        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
2208        simd_bitmask::<i32x8, u8>(mask).into()
2209    }
2210}
2211
2212/// Returns a vector of type __m256d with all elements set to zero.
2213///
2214/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
2215#[inline]
2216#[target_feature(enable = "avx")]
2217#[cfg_attr(test, assert_instr(vxorp))]
2218#[stable(feature = "simd_x86", since = "1.27.0")]
2219pub fn _mm256_setzero_pd() -> __m256d {
2220    const { unsafe { mem::zeroed() } }
2221}
2222
2223/// Returns a vector of type __m256 with all elements set to zero.
2224///
2225/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
2226#[inline]
2227#[target_feature(enable = "avx")]
2228#[cfg_attr(test, assert_instr(vxorps))]
2229#[stable(feature = "simd_x86", since = "1.27.0")]
2230pub fn _mm256_setzero_ps() -> __m256 {
2231    const { unsafe { mem::zeroed() } }
2232}
2233
2234/// Returns a vector of type __m256i with all elements set to zero.
2235///
2236/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
2237#[inline]
2238#[target_feature(enable = "avx")]
2239#[cfg_attr(test, assert_instr(vxor))]
2240#[stable(feature = "simd_x86", since = "1.27.0")]
2241pub fn _mm256_setzero_si256() -> __m256i {
2242    const { unsafe { mem::zeroed() } }
2243}
2244
2245/// Sets packed double-precision (64-bit) floating-point elements in returned
2246/// vector with the supplied values.
2247///
2248/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
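///
/// # Examples
///
/// A minimal sketch (illustrative only), assuming AVX has been detected at
/// runtime; `_mm256_set_pd` takes its arguments from the highest element down
/// to the lowest, so it is the reverse of `_mm256_setr_pd`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("avx") {
///         use std::arch::x86_64::*;
///         unsafe {
///             let a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
///             let mut out = [0.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), a);
///             // Element 0 (the lowest lane) holds the last argument.
///             assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
///         }
///     }
/// }
/// ```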
2249#[inline]
2250#[target_feature(enable = "avx")]
2251// This intrinsic has no corresponding instruction.
2252#[cfg_attr(test, assert_instr(vinsertf128))]
2253#[stable(feature = "simd_x86", since = "1.27.0")]
2254pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2255    _mm256_setr_pd(d, c, b, a)
2256}
2257
2258/// Sets packed single-precision (32-bit) floating-point elements in returned
2259/// vector with the supplied values.
2260///
2261/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2262#[inline]
2263#[target_feature(enable = "avx")]
2264// This intrinsic has no corresponding instruction.
2265#[stable(feature = "simd_x86", since = "1.27.0")]
2266pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
2267    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2268}
2269
2270/// Sets packed 8-bit integers in returned vector with the supplied values.
2271///
2272/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
2273#[inline]
2274#[target_feature(enable = "avx")]
2275// This intrinsic has no corresponding instruction.
2276#[stable(feature = "simd_x86", since = "1.27.0")]
2277pub fn _mm256_set_epi8(
2278    e00: i8,
2279    e01: i8,
2280    e02: i8,
2281    e03: i8,
2282    e04: i8,
2283    e05: i8,
2284    e06: i8,
2285    e07: i8,
2286    e08: i8,
2287    e09: i8,
2288    e10: i8,
2289    e11: i8,
2290    e12: i8,
2291    e13: i8,
2292    e14: i8,
2293    e15: i8,
2294    e16: i8,
2295    e17: i8,
2296    e18: i8,
2297    e19: i8,
2298    e20: i8,
2299    e21: i8,
2300    e22: i8,
2301    e23: i8,
2302    e24: i8,
2303    e25: i8,
2304    e26: i8,
2305    e27: i8,
2306    e28: i8,
2307    e29: i8,
2308    e30: i8,
2309    e31: i8,
2310) -> __m256i {
2311    #[rustfmt::skip]
2312    _mm256_setr_epi8(
2313        e31, e30, e29, e28, e27, e26, e25, e24,
2314        e23, e22, e21, e20, e19, e18, e17, e16,
2315        e15, e14, e13, e12, e11, e10, e09, e08,
2316        e07, e06, e05, e04, e03, e02, e01, e00,
2317    )
2318}
2319
2320/// Sets packed 16-bit integers in returned vector with the supplied values.
2321///
2322/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
2323#[inline]
2324#[target_feature(enable = "avx")]
2325// This intrinsic has no corresponding instruction.
2326#[stable(feature = "simd_x86", since = "1.27.0")]
2327pub fn _mm256_set_epi16(
2328    e00: i16,
2329    e01: i16,
2330    e02: i16,
2331    e03: i16,
2332    e04: i16,
2333    e05: i16,
2334    e06: i16,
2335    e07: i16,
2336    e08: i16,
2337    e09: i16,
2338    e10: i16,
2339    e11: i16,
2340    e12: i16,
2341    e13: i16,
2342    e14: i16,
2343    e15: i16,
2344) -> __m256i {
2345    #[rustfmt::skip]
2346    _mm256_setr_epi16(
2347        e15, e14, e13, e12,
2348        e11, e10, e09, e08,
2349        e07, e06, e05, e04,
2350        e03, e02, e01, e00,
2351    )
2352}
2353
2354/// Sets packed 32-bit integers in returned vector with the supplied values.
2355///
2356/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2357#[inline]
2358#[target_feature(enable = "avx")]
2359// This intrinsic has no corresponding instruction.
2360#[stable(feature = "simd_x86", since = "1.27.0")]
2361pub fn _mm256_set_epi32(
2362    e0: i32,
2363    e1: i32,
2364    e2: i32,
2365    e3: i32,
2366    e4: i32,
2367    e5: i32,
2368    e6: i32,
2369    e7: i32,
2370) -> __m256i {
2371    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2372}
2373
2374/// Sets packed 64-bit integers in returned vector with the supplied values.
2375///
2376/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2377#[inline]
2378#[target_feature(enable = "avx")]
2379// This intrinsic has no corresponding instruction.
2380#[stable(feature = "simd_x86", since = "1.27.0")]
2381pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2382    _mm256_setr_epi64x(d, c, b, a)
2383}
2384
2385/// Sets packed double-precision (64-bit) floating-point elements in returned
2386/// vector with the supplied values in reverse order.
2387///
2388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
2389#[inline]
2390#[target_feature(enable = "avx")]
2391// This intrinsic has no corresponding instruction.
2392#[stable(feature = "simd_x86", since = "1.27.0")]
2393pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2394    __m256d([a, b, c, d])
2395}
2396
2397/// Sets packed single-precision (32-bit) floating-point elements in returned
2398/// vector with the supplied values in reverse order.
2399///
2400/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
2401#[inline]
2402#[target_feature(enable = "avx")]
2403// This intrinsic has no corresponding instruction.
2404#[stable(feature = "simd_x86", since = "1.27.0")]
2405pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
2406    __m256([a, b, c, d, e, f, g, h])
2407}
2408
2409/// Sets packed 8-bit integers in returned vector with the supplied values in
2410/// reverse order.
2411///
2412/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
2413#[inline]
2414#[target_feature(enable = "avx")]
2415// This intrinsic has no corresponding instruction.
2416#[stable(feature = "simd_x86", since = "1.27.0")]
2417pub fn _mm256_setr_epi8(
2418    e00: i8,
2419    e01: i8,
2420    e02: i8,
2421    e03: i8,
2422    e04: i8,
2423    e05: i8,
2424    e06: i8,
2425    e07: i8,
2426    e08: i8,
2427    e09: i8,
2428    e10: i8,
2429    e11: i8,
2430    e12: i8,
2431    e13: i8,
2432    e14: i8,
2433    e15: i8,
2434    e16: i8,
2435    e17: i8,
2436    e18: i8,
2437    e19: i8,
2438    e20: i8,
2439    e21: i8,
2440    e22: i8,
2441    e23: i8,
2442    e24: i8,
2443    e25: i8,
2444    e26: i8,
2445    e27: i8,
2446    e28: i8,
2447    e29: i8,
2448    e30: i8,
2449    e31: i8,
2450) -> __m256i {
2451    unsafe {
2452        #[rustfmt::skip]
2453        transmute(i8x32::new(
2454            e00, e01, e02, e03, e04, e05, e06, e07,
2455            e08, e09, e10, e11, e12, e13, e14, e15,
2456            e16, e17, e18, e19, e20, e21, e22, e23,
2457            e24, e25, e26, e27, e28, e29, e30, e31,
2458        ))
2459    }
2460}
2461
2462/// Sets packed 16-bit integers in returned vector with the supplied values in
2463/// reverse order.
2464///
2465/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2466#[inline]
2467#[target_feature(enable = "avx")]
2468// This intrinsic has no corresponding instruction.
2469#[stable(feature = "simd_x86", since = "1.27.0")]
2470pub fn _mm256_setr_epi16(
2471    e00: i16,
2472    e01: i16,
2473    e02: i16,
2474    e03: i16,
2475    e04: i16,
2476    e05: i16,
2477    e06: i16,
2478    e07: i16,
2479    e08: i16,
2480    e09: i16,
2481    e10: i16,
2482    e11: i16,
2483    e12: i16,
2484    e13: i16,
2485    e14: i16,
2486    e15: i16,
2487) -> __m256i {
2488    unsafe {
2489        #[rustfmt::skip]
2490        transmute(i16x16::new(
2491            e00, e01, e02, e03,
2492            e04, e05, e06, e07,
2493            e08, e09, e10, e11,
2494            e12, e13, e14, e15,
2495        ))
2496    }
2497}
2498
2499/// Sets packed 32-bit integers in returned vector with the supplied values in
2500/// reverse order.
2501///
2502/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2503#[inline]
2504#[target_feature(enable = "avx")]
2505// This intrinsic has no corresponding instruction.
2506#[stable(feature = "simd_x86", since = "1.27.0")]
2507pub fn _mm256_setr_epi32(
2508    e0: i32,
2509    e1: i32,
2510    e2: i32,
2511    e3: i32,
2512    e4: i32,
2513    e5: i32,
2514    e6: i32,
2515    e7: i32,
2516) -> __m256i {
2517    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
2518}
2519
2520/// Sets packed 64-bit integers in returned vector with the supplied values in
2521/// reverse order.
2522///
2523/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2524#[inline]
2525#[target_feature(enable = "avx")]
2526// This intrinsic has no corresponding instruction.
2527#[stable(feature = "simd_x86", since = "1.27.0")]
2528pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2529    unsafe { transmute(i64x4::new(a, b, c, d)) }
2530}
2531
2532/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2533/// elements of returned vector.
2534///
2535/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2536#[inline]
2537#[target_feature(enable = "avx")]
2538// This intrinsic has no corresponding instruction.
2539#[stable(feature = "simd_x86", since = "1.27.0")]
2540pub fn _mm256_set1_pd(a: f64) -> __m256d {
2541    _mm256_setr_pd(a, a, a, a)
2542}
2543
2544/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2545/// elements of returned vector.
2546///
2547/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2548#[inline]
2549#[target_feature(enable = "avx")]
2550// This intrinsic has no corresponding instruction.
2551#[stable(feature = "simd_x86", since = "1.27.0")]
2552pub fn _mm256_set1_ps(a: f32) -> __m256 {
2553    _mm256_setr_ps(a, a, a, a, a, a, a, a)
2554}
2555
2556/// Broadcasts 8-bit integer `a` to all elements of returned vector.
2557/// This intrinsic may generate the `vpbroadcastb` instruction.
2558///
2559/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2560#[inline]
2561#[target_feature(enable = "avx")]
2562// This intrinsic has no corresponding instruction.
2563#[stable(feature = "simd_x86", since = "1.27.0")]
2564pub fn _mm256_set1_epi8(a: i8) -> __m256i {
2565    #[rustfmt::skip]
2566    _mm256_setr_epi8(
2567        a, a, a, a, a, a, a, a,
2568        a, a, a, a, a, a, a, a,
2569        a, a, a, a, a, a, a, a,
2570        a, a, a, a, a, a, a, a,
2571    )
2572}
2573
2574/// Broadcasts 16-bit integer `a` to all elements of returned vector.
2575/// This intrinsic may generate the `vpbroadcastw` instruction.
2576///
2577/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2578#[inline]
2579#[target_feature(enable = "avx")]
2580//#[cfg_attr(test, assert_instr(vpshufb))]
2581#[cfg_attr(test, assert_instr(vinsertf128))]
2582// This intrinsic has no corresponding instruction.
2583#[stable(feature = "simd_x86", since = "1.27.0")]
2584pub fn _mm256_set1_epi16(a: i16) -> __m256i {
2585    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
2586}
2587
2588/// Broadcasts 32-bit integer `a` to all elements of returned vector.
2589/// This intrinsic may generate the `vpbroadcastd` instruction.
2590///
2591/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
2592#[inline]
2593#[target_feature(enable = "avx")]
2594// This intrinsic has no corresponding instruction.
2595#[stable(feature = "simd_x86", since = "1.27.0")]
2596pub fn _mm256_set1_epi32(a: i32) -> __m256i {
2597    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
2598}
2599
2600/// Broadcasts 64-bit integer `a` to all elements of returned vector.
2601/// This intrinsic may generate the `vpbroadcastq` instruction.
2602///
2603/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
2604#[inline]
2605#[target_feature(enable = "avx")]
2606#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2607#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
2608// This intrinsic has no corresponding instruction.
2609#[stable(feature = "simd_x86", since = "1.27.0")]
2610pub fn _mm256_set1_epi64x(a: i64) -> __m256i {
2611    _mm256_setr_epi64x(a, a, a, a)
2612}
2613
2614/// Casts vector of type __m256d to type __m256.
2615///
2616/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2617#[inline]
2618#[target_feature(enable = "avx")]
2619// This intrinsic is only used for compilation and does not generate any
2620// instructions, thus it has zero latency.
2621#[stable(feature = "simd_x86", since = "1.27.0")]
2622pub fn _mm256_castpd_ps(a: __m256d) -> __m256 {
2623    unsafe { transmute(a) }
2624}
2625
2626/// Casts vector of type __m256 to type __m256d.
2627///
2628/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2629#[inline]
2630#[target_feature(enable = "avx")]
2631// This intrinsic is only used for compilation and does not generate any
2632// instructions, thus it has zero latency.
2633#[stable(feature = "simd_x86", since = "1.27.0")]
2634pub fn _mm256_castps_pd(a: __m256) -> __m256d {
2635    unsafe { transmute(a) }
2636}
2637
2638/// Casts vector of type __m256 to type __m256i.
2639///
2640/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2641#[inline]
2642#[target_feature(enable = "avx")]
2643// This intrinsic is only used for compilation and does not generate any
2644// instructions, thus it has zero latency.
2645#[stable(feature = "simd_x86", since = "1.27.0")]
2646pub fn _mm256_castps_si256(a: __m256) -> __m256i {
2647    unsafe { transmute(a) }
2648}
2649
2650/// Casts vector of type __m256i to type __m256.
2651///
2652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2653#[inline]
2654#[target_feature(enable = "avx")]
2655// This intrinsic is only used for compilation and does not generate any
2656// instructions, thus it has zero latency.
2657#[stable(feature = "simd_x86", since = "1.27.0")]
2658pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
2659    unsafe { transmute(a) }
2660}
2661
2662/// Casts vector of type __m256d to type __m256i.
2663///
2664/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
2665#[inline]
2666#[target_feature(enable = "avx")]
2667// This intrinsic is only used for compilation and does not generate any
2668// instructions, thus it has zero latency.
2669#[stable(feature = "simd_x86", since = "1.27.0")]
2670pub fn _mm256_castpd_si256(a: __m256d) -> __m256i {
2671    unsafe { transmute(a) }
2672}
2673
2674/// Casts vector of type __m256i to type __m256d.
2675///
2676/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
2677#[inline]
2678#[target_feature(enable = "avx")]
2679// This intrinsic is only used for compilation and does not generate any
2680// instructions, thus it has zero latency.
2681#[stable(feature = "simd_x86", since = "1.27.0")]
2682pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
2683    unsafe { transmute(a) }
2684}
2685
2686/// Casts vector of type __m256 to type __m128.
2687///
2688/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
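///
/// A sketch of taking both halves of a 256-bit vector (illustrative only;
/// hypothetical helper name, not one of this crate's doc-tests):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// unsafe fn split(a: __m256) -> (__m128, __m128) {
///     let lo = _mm256_castps256_ps128(a); // lanes 0..=3, no instruction emitted
///     let hi = _mm256_extractf128_ps::<1>(a); // lanes 4..=7 via `vextractf128`
///     (lo, hi)
/// }
/// ```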
2689#[inline]
2690#[target_feature(enable = "avx")]
2691// This intrinsic is only used for compilation and does not generate any
2692// instructions, thus it has zero latency.
2693#[stable(feature = "simd_x86", since = "1.27.0")]
2694pub fn _mm256_castps256_ps128(a: __m256) -> __m128 {
2695    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
2696}
2697
2698/// Casts vector of type __m256d to type __m128d.
2699///
2700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
2701#[inline]
2702#[target_feature(enable = "avx")]
2703// This intrinsic is only used for compilation and does not generate any
2704// instructions, thus it has zero latency.
2705#[stable(feature = "simd_x86", since = "1.27.0")]
2706pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
2707    unsafe { simd_shuffle!(a, a, [0, 1]) }
2708}
2709
2710/// Casts vector of type __m256i to type __m128i.
2711///
2712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
2713#[inline]
2714#[target_feature(enable = "avx")]
2715// This intrinsic is only used for compilation and does not generate any
2716// instructions, thus it has zero latency.
2717#[stable(feature = "simd_x86", since = "1.27.0")]
2718pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
2719    unsafe {
2720        let a = a.as_i64x4();
2721        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
2722        transmute(dst)
2723    }
2724}
2725
2726/// Casts vector of type __m128 to type __m256;
2727/// the upper 128 bits of the result are undefined.
2728///
2729/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
2730#[inline]
2731#[target_feature(enable = "avx")]
2732// This intrinsic is only used for compilation and does not generate any
2733// instructions, thus it has zero latency.
2734#[stable(feature = "simd_x86", since = "1.27.0")]
2735pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
2736    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
2737}
2738
2739/// Casts vector of type __m128d to type __m256d;
2740/// the upper 128 bits of the result are undefined.
2741///
2742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
2743#[inline]
2744#[target_feature(enable = "avx")]
2745// This intrinsic is only used for compilation and does not generate any
2746// instructions, thus it has zero latency.
2747#[stable(feature = "simd_x86", since = "1.27.0")]
2748pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
2749    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
2750}
2751
2752/// Casts vector of type __m128i to type __m256i;
2753/// the upper 128 bits of the result are undefined.
2754///
2755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
2756#[inline]
2757#[target_feature(enable = "avx")]
2758// This intrinsic is only used for compilation and does not generate any
2759// instructions, thus it has zero latency.
2760#[stable(feature = "simd_x86", since = "1.27.0")]
2761pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
2762    unsafe {
2763        let a = a.as_i64x2();
2764        let undefined = i64x2::ZERO;
2765        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
2766        transmute(dst)
2767    }
2768}
2769
2770/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
2771/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
2772/// the value of the source vector. The upper 128 bits are set to zero.
2773///
2774/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
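///
/// A sketch contrasting this with `_mm256_castps128_ps256`, whose upper half is
/// left indeterminate (illustrative only; hypothetical helper name, not one of
/// this crate's doc-tests):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// unsafe fn widen(a: __m128) -> (__m256, __m256) {
///     let zero_extended = _mm256_zextps128_ps256(a); // upper 128 bits are 0.0
///     let just_cast = _mm256_castps128_ps256(a); // upper 128 bits are indeterminate
///     (zero_extended, just_cast)
/// }
/// ```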
2775#[inline]
2776#[target_feature(enable = "avx")]
2777// This intrinsic is only used for compilation and does not generate any
2778// instructions, thus it has zero latency.
2779#[stable(feature = "simd_x86", since = "1.27.0")]
2780pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
2781    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
2782}
2783
2784/// Constructs a 256-bit integer vector from a 128-bit integer vector.
2785/// The lower 128 bits contain the value of the source vector. The upper
2786/// 128 bits are set to zero.
2787///
2788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
2789#[inline]
2790#[target_feature(enable = "avx")]
2791// This intrinsic is only used for compilation and does not generate any
2792// instructions, thus it has zero latency.
2793#[stable(feature = "simd_x86", since = "1.27.0")]
2794pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
2795    unsafe {
2796        let b = i64x2::ZERO;
2797        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
2798        transmute(dst)
2799    }
2800}
2801
2802/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
2803/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
2804/// contain the value of the source vector. The upper 128 bits are set
2805/// to zero.
2806///
2807/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
2808#[inline]
2809#[target_feature(enable = "avx")]
2810// This intrinsic is only used for compilation and does not generate any
2811// instructions, thus it has zero latency.
2812#[stable(feature = "simd_x86", since = "1.27.0")]
2813pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
2814    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
2815}
2816
2817/// Returns vector of type `__m256` with indeterminate elements.
2818/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2819/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2820/// In practice, this is typically equivalent to [`mem::zeroed`].
2821///
2822/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
2823#[inline]
2824#[target_feature(enable = "avx")]
2825// This intrinsic has no corresponding instruction.
2826#[stable(feature = "simd_x86", since = "1.27.0")]
2827pub fn _mm256_undefined_ps() -> __m256 {
2828    const { unsafe { mem::zeroed() } }
2829}
2830
2831/// Returns vector of type `__m256d` with indeterminate elements.
2832/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2833/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2834/// In practice, this is typically equivalent to [`mem::zeroed`].
2835///
2836/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
2837#[inline]
2838#[target_feature(enable = "avx")]
2839// This intrinsic has no corresponding instruction.
2840#[stable(feature = "simd_x86", since = "1.27.0")]
2841pub fn _mm256_undefined_pd() -> __m256d {
2842    const { unsafe { mem::zeroed() } }
2843}
2844
/// Returns vector of type `__m256i` with indeterminate elements.
2846/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2847/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2848/// In practice, this is typically equivalent to [`mem::zeroed`].
2849///
2850/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
2851#[inline]
2852#[target_feature(enable = "avx")]
2853// This intrinsic has no corresponding instruction.
2854#[stable(feature = "simd_x86", since = "1.27.0")]
2855pub fn _mm256_undefined_si256() -> __m256i {
2856    const { unsafe { mem::zeroed() } }
2857}
2858
2859/// Sets packed __m256 returned vector with the supplied values.
2860///
2861/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
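///
/// A sketch of the half ordering (illustrative only; hypothetical helper name,
/// not one of this crate's doc-tests):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// unsafe fn combine() -> __m256 {
///     let lo = _mm_setr_ps(1., 2., 3., 4.);
///     let hi = _mm_setr_ps(5., 6., 7., 8.);
///     // `lo` fills lanes 0..=3 and `hi` fills lanes 4..=7, so the result equals
///     // `_mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.)`.
///     // `_mm256_setr_m128(lo, hi)` produces the same value with the arguments swapped.
///     _mm256_set_m128(hi, lo)
/// }
/// ```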
2862#[inline]
2863#[target_feature(enable = "avx")]
2864#[cfg_attr(test, assert_instr(vinsertf128))]
2865#[stable(feature = "simd_x86", since = "1.27.0")]
2866pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
2867    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
2868}
2869
2870/// Sets packed __m256d returned vector with the supplied values.
2871///
2872/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
2873#[inline]
2874#[target_feature(enable = "avx")]
2875#[cfg_attr(test, assert_instr(vinsertf128))]
2876#[stable(feature = "simd_x86", since = "1.27.0")]
2877pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
2878    unsafe {
2879        let hi: __m128 = transmute(hi);
2880        let lo: __m128 = transmute(lo);
2881        transmute(_mm256_set_m128(hi, lo))
2882    }
2883}
2884
2885/// Sets packed __m256i returned vector with the supplied values.
2886///
2887/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
2888#[inline]
2889#[target_feature(enable = "avx")]
2890#[cfg_attr(test, assert_instr(vinsertf128))]
2891#[stable(feature = "simd_x86", since = "1.27.0")]
2892pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
2893    unsafe {
2894        let hi: __m128 = transmute(hi);
2895        let lo: __m128 = transmute(lo);
2896        transmute(_mm256_set_m128(hi, lo))
2897    }
2898}
2899
2900/// Sets packed __m256 returned vector with the supplied values.
2901///
2902/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
2903#[inline]
2904#[target_feature(enable = "avx")]
2905#[cfg_attr(test, assert_instr(vinsertf128))]
2906#[stable(feature = "simd_x86", since = "1.27.0")]
2907pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
2908    _mm256_set_m128(hi, lo)
2909}
2910
2911/// Sets packed __m256d returned vector with the supplied values.
2912///
2913/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
2914#[inline]
2915#[target_feature(enable = "avx")]
2916#[cfg_attr(test, assert_instr(vinsertf128))]
2917#[stable(feature = "simd_x86", since = "1.27.0")]
2918pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
2919    _mm256_set_m128d(hi, lo)
2920}
2921
2922/// Sets packed __m256i returned vector with the supplied values.
2923///
2924/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
2925#[inline]
2926#[target_feature(enable = "avx")]
2927#[cfg_attr(test, assert_instr(vinsertf128))]
2928#[stable(feature = "simd_x86", since = "1.27.0")]
2929pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
2930    _mm256_set_m128i(hi, lo)
2931}
2932
/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from memory, and combines them into a 256-bit
/// value.
2936/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2937///
2938/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
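///
/// A sketch of loading the two halves from separate, possibly unaligned
/// locations (illustrative only; hypothetical helper name, not one of this
/// crate's doc-tests):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// unsafe fn gather_halves(data: &[f32; 16]) -> __m256 {
///     // Lower half from `data[0..4]`, upper half from `data[8..12]`;
///     // neither address needs any particular alignment.
///     _mm256_loadu2_m128(data.as_ptr().add(8), data.as_ptr())
/// }
/// ```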
2939#[inline]
2940#[target_feature(enable = "avx")]
2941// This intrinsic has no corresponding instruction.
2942#[stable(feature = "simd_x86", since = "1.27.0")]
2943pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
2944    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
2945    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
2946}
2947
/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory, and combines them into a 256-bit
/// value.
2951/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2952///
2953/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
2954#[inline]
2955#[target_feature(enable = "avx")]
2956// This intrinsic has no corresponding instruction.
2957#[stable(feature = "simd_x86", since = "1.27.0")]
2958pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
2959    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
2960    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
2961}
2962
/// Loads two 128-bit values (composed of integer data) from memory, and
/// combines them into a 256-bit value.
2965/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2966///
2967/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
2968#[inline]
2969#[target_feature(enable = "avx")]
2970// This intrinsic has no corresponding instruction.
2971#[stable(feature = "simd_x86", since = "1.27.0")]
2972pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
2973    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
2974    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
2975}
2976
/// Stores the high and low 128-bit halves (each composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `a` into two
/// different 128-bit memory locations.
2980/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2981///
2982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
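///
/// A sketch of splitting a vector into two separately stored halves
/// (illustrative only; hypothetical helper name, not one of this crate's
/// doc-tests):
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx")]
/// unsafe fn scatter_halves(a: __m256, out: &mut [f32; 16]) {
///     // Lower half to `out[0..4]`, upper half to `out[8..12]`;
///     // neither address needs any particular alignment.
///     _mm256_storeu2_m128(out.as_mut_ptr().add(8), out.as_mut_ptr(), a);
/// }
/// ```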
2983#[inline]
2984#[target_feature(enable = "avx")]
2985// This intrinsic has no corresponding instruction.
2986#[stable(feature = "simd_x86", since = "1.27.0")]
2987pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
2988    let lo = _mm256_castps256_ps128(a);
2989    _mm_storeu_ps(loaddr, lo);
2990    let hi = _mm256_extractf128_ps::<1>(a);
2991    _mm_storeu_ps(hiaddr, hi);
2992}
2993
/// Stores the high and low 128-bit halves (each composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `a` into two
/// different 128-bit memory locations.
2997/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2998///
2999/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
3000#[inline]
3001#[target_feature(enable = "avx")]
3002// This intrinsic has no corresponding instruction.
3003#[stable(feature = "simd_x86", since = "1.27.0")]
3004pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
3005    let lo = _mm256_castpd256_pd128(a);
3006    _mm_storeu_pd(loaddr, lo);
3007    let hi = _mm256_extractf128_pd::<1>(a);
3008    _mm_storeu_pd(hiaddr, hi);
3009}
3010
/// Stores the high and low 128-bit halves (each composed of integer data) from
/// `a` into two different 128-bit memory locations.
3013/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3014///
3015/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
3016#[inline]
3017#[target_feature(enable = "avx")]
3018// This intrinsic has no corresponding instruction.
3019#[stable(feature = "simd_x86", since = "1.27.0")]
3020pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
3021    let lo = _mm256_castsi256_si128(a);
3022    _mm_storeu_si128(loaddr, lo);
3023    let hi = _mm256_extractf128_si256::<1>(a);
3024    _mm_storeu_si128(hiaddr, hi);
3025}
3026
3027/// Returns the first element of the input vector of `[8 x float]`.
3028///
3029/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
3030#[inline]
3031#[target_feature(enable = "avx")]
3032//#[cfg_attr(test, assert_instr(movss))] FIXME
3033#[stable(feature = "simd_x86", since = "1.27.0")]
3034pub fn _mm256_cvtss_f32(a: __m256) -> f32 {
3035    unsafe { simd_extract!(a, 0) }
3036}
3037
3038// LLVM intrinsics used in the above functions
3039#[allow(improper_ctypes)]
3040unsafe extern "C" {
3041    #[link_name = "llvm.x86.avx.round.pd.256"]
3042    fn roundpd256(a: __m256d, b: i32) -> __m256d;
3043    #[link_name = "llvm.x86.avx.round.ps.256"]
3044    fn roundps256(a: __m256, b: i32) -> __m256;
3045    #[link_name = "llvm.x86.avx.dp.ps.256"]
3046    fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
3047    #[link_name = "llvm.x86.avx.hadd.pd.256"]
3048    fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
3049    #[link_name = "llvm.x86.avx.hadd.ps.256"]
3050    fn vhaddps(a: __m256, b: __m256) -> __m256;
3051    #[link_name = "llvm.x86.avx.hsub.pd.256"]
3052    fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
3053    #[link_name = "llvm.x86.avx.hsub.ps.256"]
3054    fn vhsubps(a: __m256, b: __m256) -> __m256;
3055    #[link_name = "llvm.x86.sse2.cmp.pd"]
3056    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3057    #[link_name = "llvm.x86.avx.cmp.pd.256"]
3058    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
3059    #[link_name = "llvm.x86.sse.cmp.ps"]
3060    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
3061    #[link_name = "llvm.x86.avx.cmp.ps.256"]
3062    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
3063    #[link_name = "llvm.x86.sse2.cmp.sd"]
3064    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3065    #[link_name = "llvm.x86.sse.cmp.ss"]
3066    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
3067    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
3068    fn vcvtps2dq(a: __m256) -> i32x8;
3069    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
3070    fn vcvttpd2dq(a: __m256d) -> i32x4;
3071    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
3072    fn vcvtpd2dq(a: __m256d) -> i32x4;
3073    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
3074    fn vcvttps2dq(a: __m256) -> i32x8;
3075    #[link_name = "llvm.x86.avx.vzeroall"]
3076    fn vzeroall();
3077    #[link_name = "llvm.x86.avx.vzeroupper"]
3078    fn vzeroupper();
3079    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
3080    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
3081    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
3082    fn vpermilps(a: __m128, b: i32x4) -> __m128;
3083    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
3084    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
3085    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
3086    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
3087    #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
3088    fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
3089    #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
3090    fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
3091    #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
3092    fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
3093    #[link_name = "llvm.x86.avx.maskload.pd.256"]
3094    fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
3095    #[link_name = "llvm.x86.avx.maskstore.pd.256"]
3096    fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
3097    #[link_name = "llvm.x86.avx.maskload.pd"]
3098    fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
3099    #[link_name = "llvm.x86.avx.maskstore.pd"]
3100    fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
3101    #[link_name = "llvm.x86.avx.maskload.ps.256"]
3102    fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
3103    #[link_name = "llvm.x86.avx.maskstore.ps.256"]
3104    fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
3105    #[link_name = "llvm.x86.avx.maskload.ps"]
3106    fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
3107    #[link_name = "llvm.x86.avx.maskstore.ps"]
3108    fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
3109    #[link_name = "llvm.x86.avx.ldu.dq.256"]
3110    fn vlddqu(mem_addr: *const i8) -> i8x32;
3111    #[link_name = "llvm.x86.avx.rcp.ps.256"]
3112    fn vrcpps(a: __m256) -> __m256;
3113    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
3114    fn vrsqrtps(a: __m256) -> __m256;
3115    #[link_name = "llvm.x86.avx.ptestz.256"]
3116    fn ptestz256(a: i64x4, b: i64x4) -> i32;
3117    #[link_name = "llvm.x86.avx.ptestc.256"]
3118    fn ptestc256(a: i64x4, b: i64x4) -> i32;
3119    #[link_name = "llvm.x86.avx.ptestnzc.256"]
3120    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
3121    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
3122    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
3123    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
3124    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
3125    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
3126    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
3127    #[link_name = "llvm.x86.avx.vtestz.pd"]
3128    fn vtestzpd(a: __m128d, b: __m128d) -> i32;
3129    #[link_name = "llvm.x86.avx.vtestc.pd"]
3130    fn vtestcpd(a: __m128d, b: __m128d) -> i32;
3131    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
3132    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
3133    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
3134    fn vtestzps256(a: __m256, b: __m256) -> i32;
3135    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
3136    fn vtestcps256(a: __m256, b: __m256) -> i32;
3137    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
3138    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
3139    #[link_name = "llvm.x86.avx.vtestz.ps"]
3140    fn vtestzps(a: __m128, b: __m128) -> i32;
3141    #[link_name = "llvm.x86.avx.vtestc.ps"]
3142    fn vtestcps(a: __m128, b: __m128) -> i32;
3143    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
3144    fn vtestnzcps(a: __m128, b: __m128) -> i32;
3145    #[link_name = "llvm.x86.avx.min.ps.256"]
3146    fn vminps(a: __m256, b: __m256) -> __m256;
3147    #[link_name = "llvm.x86.avx.max.ps.256"]
3148    fn vmaxps(a: __m256, b: __m256) -> __m256;
3149    #[link_name = "llvm.x86.avx.min.pd.256"]
3150    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
3151    #[link_name = "llvm.x86.avx.max.pd.256"]
3152    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
3153}
3154
3155#[cfg(test)]
3156mod tests {
3157    use crate::hint::black_box;
3158    use crate::ptr;
3159    use stdarch_test::simd_test;
3160
3161    use crate::core_arch::x86::*;
3162
3163    #[simd_test(enable = "avx")]
3164    unsafe fn test_mm256_add_pd() {
3165        let a = _mm256_setr_pd(1., 2., 3., 4.);
3166        let b = _mm256_setr_pd(5., 6., 7., 8.);
3167        let r = _mm256_add_pd(a, b);
3168        let e = _mm256_setr_pd(6., 8., 10., 12.);
3169        assert_eq_m256d(r, e);
3170    }
3171
3172    #[simd_test(enable = "avx")]
3173    unsafe fn test_mm256_add_ps() {
3174        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3175        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3176        let r = _mm256_add_ps(a, b);
3177        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3178        assert_eq_m256(r, e);
3179    }
3180
3181    #[simd_test(enable = "avx")]
3182    unsafe fn test_mm256_and_pd() {
3183        let a = _mm256_set1_pd(1.);
3184        let b = _mm256_set1_pd(0.6);
3185        let r = _mm256_and_pd(a, b);
3186        let e = _mm256_set1_pd(0.5);
3187        assert_eq_m256d(r, e);
3188    }
3189
3190    #[simd_test(enable = "avx")]
3191    unsafe fn test_mm256_and_ps() {
3192        let a = _mm256_set1_ps(1.);
3193        let b = _mm256_set1_ps(0.6);
3194        let r = _mm256_and_ps(a, b);
3195        let e = _mm256_set1_ps(0.5);
3196        assert_eq_m256(r, e);
3197    }
3198
3199    #[simd_test(enable = "avx")]
3200    unsafe fn test_mm256_or_pd() {
3201        let a = _mm256_set1_pd(1.);
3202        let b = _mm256_set1_pd(0.6);
3203        let r = _mm256_or_pd(a, b);
3204        let e = _mm256_set1_pd(1.2);
3205        assert_eq_m256d(r, e);
3206    }
3207
3208    #[simd_test(enable = "avx")]
3209    unsafe fn test_mm256_or_ps() {
3210        let a = _mm256_set1_ps(1.);
3211        let b = _mm256_set1_ps(0.6);
3212        let r = _mm256_or_ps(a, b);
3213        let e = _mm256_set1_ps(1.2);
3214        assert_eq_m256(r, e);
3215    }
3216
3217    #[simd_test(enable = "avx")]
3218    unsafe fn test_mm256_shuffle_pd() {
3219        let a = _mm256_setr_pd(1., 4., 5., 8.);
3220        let b = _mm256_setr_pd(2., 3., 6., 7.);
3221        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3222        let e = _mm256_setr_pd(4., 3., 8., 7.);
3223        assert_eq_m256d(r, e);
3224    }
3225
3226    #[simd_test(enable = "avx")]
3227    unsafe fn test_mm256_shuffle_ps() {
3228        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3229        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3230        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3231        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3232        assert_eq_m256(r, e);
3233    }
3234
3235    #[simd_test(enable = "avx")]
3236    unsafe fn test_mm256_andnot_pd() {
3237        let a = _mm256_set1_pd(0.);
3238        let b = _mm256_set1_pd(0.6);
3239        let r = _mm256_andnot_pd(a, b);
3240        assert_eq_m256d(r, b);
3241    }
3242
3243    #[simd_test(enable = "avx")]
3244    unsafe fn test_mm256_andnot_ps() {
3245        let a = _mm256_set1_ps(0.);
3246        let b = _mm256_set1_ps(0.6);
3247        let r = _mm256_andnot_ps(a, b);
3248        assert_eq_m256(r, b);
3249    }
3250
3251    #[simd_test(enable = "avx")]
3252    unsafe fn test_mm256_max_pd() {
3253        let a = _mm256_setr_pd(1., 4., 5., 8.);
3254        let b = _mm256_setr_pd(2., 3., 6., 7.);
3255        let r = _mm256_max_pd(a, b);
3256        let e = _mm256_setr_pd(2., 4., 6., 8.);
3257        assert_eq_m256d(r, e);
3258        // > If the values being compared are both 0.0s (of either sign), the
3259        // > value in the second operand (source operand) is returned.
3260        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3261        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3262        let wu: [u64; 4] = transmute(w);
3263        let xu: [u64; 4] = transmute(x);
3264        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3265        assert_eq!(xu, [0u64; 4]);
3266        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3267        // > second operand (source operand), either a NaN or a valid
3268        // > floating-point value, is written to the result.
3269        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3270        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3271        let yf: [f64; 4] = transmute(y);
3272        let zf: [f64; 4] = transmute(z);
3273        assert_eq!(yf, [0.0; 4]);
3274        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3275    }
3276
3277    #[simd_test(enable = "avx")]
3278    unsafe fn test_mm256_max_ps() {
3279        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3280        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3281        let r = _mm256_max_ps(a, b);
3282        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
3283        assert_eq_m256(r, e);
3284        // > If the values being compared are both 0.0s (of either sign), the
3285        // > value in the second operand (source operand) is returned.
3286        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3287        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3288        let wu: [u32; 8] = transmute(w);
3289        let xu: [u32; 8] = transmute(x);
3290        assert_eq!(wu, [0x8000_0000u32; 8]);
3291        assert_eq!(xu, [0u32; 8]);
3292        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3293        // > second operand (source operand), either a NaN or a valid
3294        // > floating-point value, is written to the result.
3295        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3296        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3297        let yf: [f32; 8] = transmute(y);
3298        let zf: [f32; 8] = transmute(z);
3299        assert_eq!(yf, [0.0; 8]);
3300        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3301    }
3302
3303    #[simd_test(enable = "avx")]
3304    unsafe fn test_mm256_min_pd() {
3305        let a = _mm256_setr_pd(1., 4., 5., 8.);
3306        let b = _mm256_setr_pd(2., 3., 6., 7.);
3307        let r = _mm256_min_pd(a, b);
3308        let e = _mm256_setr_pd(1., 3., 5., 7.);
3309        assert_eq_m256d(r, e);
3310        // > If the values being compared are both 0.0s (of either sign), the
3311        // > value in the second operand (source operand) is returned.
3312        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3313        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3314        let wu: [u64; 4] = transmute(w);
3315        let xu: [u64; 4] = transmute(x);
3316        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3317        assert_eq!(xu, [0u64; 4]);
3318        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3319        // > second operand (source operand), either a NaN or a valid
3320        // > floating-point value, is written to the result.
3321        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3322        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3323        let yf: [f64; 4] = transmute(y);
3324        let zf: [f64; 4] = transmute(z);
3325        assert_eq!(yf, [0.0; 4]);
3326        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3327    }
3328
3329    #[simd_test(enable = "avx")]
3330    unsafe fn test_mm256_min_ps() {
3331        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3332        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3333        let r = _mm256_min_ps(a, b);
3334        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
3335        assert_eq_m256(r, e);
3336        // > If the values being compared are both 0.0s (of either sign), the
3337        // > value in the second operand (source operand) is returned.
3338        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3339        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3340        let wu: [u32; 8] = transmute(w);
3341        let xu: [u32; 8] = transmute(x);
3342        assert_eq!(wu, [0x8000_0000u32; 8]);
3343        assert_eq!(xu, [0u32; 8]);
3344        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3345        // > second operand (source operand), either a NaN or a valid
3346        // > floating-point value, is written to the result.
3347        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3348        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3349        let yf: [f32; 8] = transmute(y);
3350        let zf: [f32; 8] = transmute(z);
3351        assert_eq!(yf, [0.0; 8]);
3352        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3353    }
3354
3355    #[simd_test(enable = "avx")]
3356    unsafe fn test_mm256_mul_pd() {
3357        let a = _mm256_setr_pd(1., 2., 3., 4.);
3358        let b = _mm256_setr_pd(5., 6., 7., 8.);
3359        let r = _mm256_mul_pd(a, b);
3360        let e = _mm256_setr_pd(5., 12., 21., 32.);
3361        assert_eq_m256d(r, e);
3362    }
3363
3364    #[simd_test(enable = "avx")]
3365    unsafe fn test_mm256_mul_ps() {
3366        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3367        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3368        let r = _mm256_mul_ps(a, b);
3369        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3370        assert_eq_m256(r, e);
3371    }
3372
3373    #[simd_test(enable = "avx")]
3374    unsafe fn test_mm256_addsub_pd() {
3375        let a = _mm256_setr_pd(1., 2., 3., 4.);
3376        let b = _mm256_setr_pd(5., 6., 7., 8.);
3377        let r = _mm256_addsub_pd(a, b);
3378        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3379        assert_eq_m256d(r, e);
3380    }
3381
3382    #[simd_test(enable = "avx")]
3383    unsafe fn test_mm256_addsub_ps() {
3384        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3385        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3386        let r = _mm256_addsub_ps(a, b);
3387        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3388        assert_eq_m256(r, e);
3389    }
3390
3391    #[simd_test(enable = "avx")]
3392    unsafe fn test_mm256_sub_pd() {
3393        let a = _mm256_setr_pd(1., 2., 3., 4.);
3394        let b = _mm256_setr_pd(5., 6., 7., 8.);
3395        let r = _mm256_sub_pd(a, b);
3396        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3397        assert_eq_m256d(r, e);
3398    }
3399
3400    #[simd_test(enable = "avx")]
3401    unsafe fn test_mm256_sub_ps() {
3402        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3403        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3404        let r = _mm256_sub_ps(a, b);
3405        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3406        assert_eq_m256(r, e);
3407    }
3408
3409    #[simd_test(enable = "avx")]
3410    unsafe fn test_mm256_round_pd() {
3411        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3412        let result_closest = _mm256_round_pd::<0b0000>(a);
3413        let result_down = _mm256_round_pd::<0b0001>(a);
3414        let result_up = _mm256_round_pd::<0b0010>(a);
3415        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3416        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3417        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3418        assert_eq_m256d(result_closest, expected_closest);
3419        assert_eq_m256d(result_down, expected_down);
3420        assert_eq_m256d(result_up, expected_up);
3421    }
3422
3423    #[simd_test(enable = "avx")]
3424    unsafe fn test_mm256_floor_pd() {
3425        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3426        let result_down = _mm256_floor_pd(a);
3427        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3428        assert_eq_m256d(result_down, expected_down);
3429    }
3430
3431    #[simd_test(enable = "avx")]
3432    unsafe fn test_mm256_ceil_pd() {
3433        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3434        let result_up = _mm256_ceil_pd(a);
3435        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3436        assert_eq_m256d(result_up, expected_up);
3437    }
3438
3439    #[simd_test(enable = "avx")]
3440    unsafe fn test_mm256_round_ps() {
3441        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3442        let result_closest = _mm256_round_ps::<0b0000>(a);
3443        let result_down = _mm256_round_ps::<0b0001>(a);
3444        let result_up = _mm256_round_ps::<0b0010>(a);
3445        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3446        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3447        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3448        assert_eq_m256(result_closest, expected_closest);
3449        assert_eq_m256(result_down, expected_down);
3450        assert_eq_m256(result_up, expected_up);
3451    }
3452
3453    #[simd_test(enable = "avx")]
3454    unsafe fn test_mm256_floor_ps() {
3455        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3456        let result_down = _mm256_floor_ps(a);
3457        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3458        assert_eq_m256(result_down, expected_down);
3459    }
3460
3461    #[simd_test(enable = "avx")]
3462    unsafe fn test_mm256_ceil_ps() {
3463        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3464        let result_up = _mm256_ceil_ps(a);
3465        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3466        assert_eq_m256(result_up, expected_up);
3467    }
3468
3469    #[simd_test(enable = "avx")]
3470    unsafe fn test_mm256_sqrt_pd() {
3471        let a = _mm256_setr_pd(4., 9., 16., 25.);
3472        let r = _mm256_sqrt_pd(a);
3473        let e = _mm256_setr_pd(2., 3., 4., 5.);
3474        assert_eq_m256d(r, e);
3475    }
3476
3477    #[simd_test(enable = "avx")]
3478    unsafe fn test_mm256_sqrt_ps() {
3479        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3480        let r = _mm256_sqrt_ps(a);
3481        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3482        assert_eq_m256(r, e);
3483    }
3484
3485    #[simd_test(enable = "avx")]
3486    unsafe fn test_mm256_div_ps() {
3487        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3488        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3489        let r = _mm256_div_ps(a, b);
3490        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3491        assert_eq_m256(r, e);
3492    }
3493
3494    #[simd_test(enable = "avx")]
3495    unsafe fn test_mm256_div_pd() {
3496        let a = _mm256_setr_pd(4., 9., 16., 25.);
3497        let b = _mm256_setr_pd(4., 3., 2., 5.);
3498        let r = _mm256_div_pd(a, b);
3499        let e = _mm256_setr_pd(1., 3., 8., 5.);
3500        assert_eq_m256d(r, e);
3501    }
3502
3503    #[simd_test(enable = "avx")]
3504    unsafe fn test_mm256_blend_pd() {
3505        let a = _mm256_setr_pd(4., 9., 16., 25.);
3506        let b = _mm256_setr_pd(4., 3., 2., 5.);
3507        let r = _mm256_blend_pd::<0x0>(a, b);
3508        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3509        let r = _mm256_blend_pd::<0x3>(a, b);
3510        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3511        let r = _mm256_blend_pd::<0xF>(a, b);
3512        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3513    }
3514
3515    #[simd_test(enable = "avx")]
3516    unsafe fn test_mm256_blend_ps() {
3517        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3518        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3519        let r = _mm256_blend_ps::<0x0>(a, b);
3520        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3521        let r = _mm256_blend_ps::<0x3>(a, b);
3522        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3523        let r = _mm256_blend_ps::<0xF>(a, b);
3524        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3525    }
3526
3527    #[simd_test(enable = "avx")]
3528    unsafe fn test_mm256_blendv_pd() {
3529        let a = _mm256_setr_pd(4., 9., 16., 25.);
3530        let b = _mm256_setr_pd(4., 3., 2., 5.);
3531        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3532        let r = _mm256_blendv_pd(a, b, c);
3533        let e = _mm256_setr_pd(4., 9., 2., 5.);
3534        assert_eq_m256d(r, e);
3535    }
3536
3537    #[simd_test(enable = "avx")]
3538    unsafe fn test_mm256_blendv_ps() {
3539        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3540        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3541        #[rustfmt::skip]
3542        let c = _mm256_setr_ps(
3543            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3544        );
3545        let r = _mm256_blendv_ps(a, b, c);
3546        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3547        assert_eq_m256(r, e);
3548    }
3549
3550    #[simd_test(enable = "avx")]
3551    unsafe fn test_mm256_dp_ps() {
3552        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3553        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3554        let r = _mm256_dp_ps::<0xFF>(a, b);
3555        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3556        assert_eq_m256(r, e);
3557    }
3558
3559    #[simd_test(enable = "avx")]
3560    unsafe fn test_mm256_hadd_pd() {
3561        let a = _mm256_setr_pd(4., 9., 16., 25.);
3562        let b = _mm256_setr_pd(4., 3., 2., 5.);
3563        let r = _mm256_hadd_pd(a, b);
3564        let e = _mm256_setr_pd(13., 7., 41., 7.);
3565        assert_eq_m256d(r, e);
3566
3567        let a = _mm256_setr_pd(1., 2., 3., 4.);
3568        let b = _mm256_setr_pd(5., 6., 7., 8.);
3569        let r = _mm256_hadd_pd(a, b);
3570        let e = _mm256_setr_pd(3., 11., 7., 15.);
3571        assert_eq_m256d(r, e);
3572    }
3573
3574    #[simd_test(enable = "avx")]
3575    unsafe fn test_mm256_hadd_ps() {
3576        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3577        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3578        let r = _mm256_hadd_ps(a, b);
3579        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3580        assert_eq_m256(r, e);
3581
3582        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3583        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3584        let r = _mm256_hadd_ps(a, b);
3585        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3586        assert_eq_m256(r, e);
3587    }
3588
3589    #[simd_test(enable = "avx")]
3590    unsafe fn test_mm256_hsub_pd() {
3591        let a = _mm256_setr_pd(4., 9., 16., 25.);
3592        let b = _mm256_setr_pd(4., 3., 2., 5.);
3593        let r = _mm256_hsub_pd(a, b);
3594        let e = _mm256_setr_pd(-5., 1., -9., -3.);
3595        assert_eq_m256d(r, e);
3596
3597        let a = _mm256_setr_pd(1., 2., 3., 4.);
3598        let b = _mm256_setr_pd(5., 6., 7., 8.);
3599        let r = _mm256_hsub_pd(a, b);
3600        let e = _mm256_setr_pd(-1., -1., -1., -1.);
3601        assert_eq_m256d(r, e);
3602    }
3603
3604    #[simd_test(enable = "avx")]
3605    unsafe fn test_mm256_hsub_ps() {
3606        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3607        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3608        let r = _mm256_hsub_ps(a, b);
3609        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3610        assert_eq_m256(r, e);
3611
3612        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3613        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3614        let r = _mm256_hsub_ps(a, b);
3615        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3616        assert_eq_m256(r, e);
3617    }
3618
3619    #[simd_test(enable = "avx")]
3620    unsafe fn test_mm256_xor_pd() {
3621        let a = _mm256_setr_pd(4., 9., 16., 25.);
3622        let b = _mm256_set1_pd(0.);
3623        let r = _mm256_xor_pd(a, b);
3624        assert_eq_m256d(r, a);
3625    }
3626
3627    #[simd_test(enable = "avx")]
3628    unsafe fn test_mm256_xor_ps() {
3629        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3630        let b = _mm256_set1_ps(0.);
3631        let r = _mm256_xor_ps(a, b);
3632        assert_eq_m256(r, a);
3633    }
3634
3635    #[simd_test(enable = "avx")]
3636    unsafe fn test_mm_cmp_pd() {
3637        let a = _mm_setr_pd(4., 9.);
3638        let b = _mm_setr_pd(4., 3.);
3639        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
3640        assert!(get_m128d(r, 0).is_nan());
3641        assert!(get_m128d(r, 1).is_nan());
3642    }
3643
3644    #[simd_test(enable = "avx")]
3645    unsafe fn test_mm256_cmp_pd() {
3646        let a = _mm256_setr_pd(1., 2., 3., 4.);
3647        let b = _mm256_setr_pd(5., 6., 7., 8.);
3648        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3649        let e = _mm256_set1_pd(0.);
3650        assert_eq_m256d(r, e);
3651    }
3652
3653    #[simd_test(enable = "avx")]
3654    unsafe fn test_mm_cmp_ps() {
3655        let a = _mm_setr_ps(4., 3., 2., 5.);
3656        let b = _mm_setr_ps(4., 9., 16., 25.);
3657        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
3658        assert!(get_m128(r, 0).is_nan());
3659        assert_eq!(get_m128(r, 1), 0.);
3660        assert_eq!(get_m128(r, 2), 0.);
3661        assert_eq!(get_m128(r, 3), 0.);
3662    }
3663
3664    #[simd_test(enable = "avx")]
3665    unsafe fn test_mm256_cmp_ps() {
3666        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3667        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3668        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3669        let e = _mm256_set1_ps(0.);
3670        assert_eq_m256(r, e);
3671    }
3672
3673    #[simd_test(enable = "avx")]
3674    unsafe fn test_mm_cmp_sd() {
3675        let a = _mm_setr_pd(4., 9.);
3676        let b = _mm_setr_pd(4., 3.);
3677        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
3678        assert!(get_m128d(r, 0).is_nan());
3679        assert_eq!(get_m128d(r, 1), 9.);
3680    }
3681
3682    #[simd_test(enable = "avx")]
3683    unsafe fn test_mm_cmp_ss() {
3684        let a = _mm_setr_ps(4., 3., 2., 5.);
3685        let b = _mm_setr_ps(4., 9., 16., 25.);
3686        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
3687        assert!(get_m128(r, 0).is_nan());
3688        assert_eq!(get_m128(r, 1), 3.);
3689        assert_eq!(get_m128(r, 2), 2.);
3690        assert_eq!(get_m128(r, 3), 5.);
3691    }
3692
3693    #[simd_test(enable = "avx")]
3694    unsafe fn test_mm256_cvtepi32_pd() {
3695        let a = _mm_setr_epi32(4, 9, 16, 25);
3696        let r = _mm256_cvtepi32_pd(a);
3697        let e = _mm256_setr_pd(4., 9., 16., 25.);
3698        assert_eq_m256d(r, e);
3699    }
3700
3701    #[simd_test(enable = "avx")]
3702    unsafe fn test_mm256_cvtepi32_ps() {
3703        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3704        let r = _mm256_cvtepi32_ps(a);
3705        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3706        assert_eq_m256(r, e);
3707    }
3708
3709    #[simd_test(enable = "avx")]
3710    unsafe fn test_mm256_cvtpd_ps() {
3711        let a = _mm256_setr_pd(4., 9., 16., 25.);
3712        let r = _mm256_cvtpd_ps(a);
3713        let e = _mm_setr_ps(4., 9., 16., 25.);
3714        assert_eq_m128(r, e);
3715    }
3716
3717    #[simd_test(enable = "avx")]
3718    unsafe fn test_mm256_cvtps_epi32() {
3719        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3720        let r = _mm256_cvtps_epi32(a);
3721        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3722        assert_eq_m256i(r, e);
3723    }
3724
3725    #[simd_test(enable = "avx")]
3726    unsafe fn test_mm256_cvtps_pd() {
3727        let a = _mm_setr_ps(4., 9., 16., 25.);
3728        let r = _mm256_cvtps_pd(a);
3729        let e = _mm256_setr_pd(4., 9., 16., 25.);
3730        assert_eq_m256d(r, e);
3731    }
3732
3733    #[simd_test(enable = "avx")]
3734    unsafe fn test_mm256_cvtsd_f64() {
3735        let a = _mm256_setr_pd(1., 2., 3., 4.);
3736        let r = _mm256_cvtsd_f64(a);
3737        assert_eq!(r, 1.);
3738    }
3739
3740    #[simd_test(enable = "avx")]
3741    unsafe fn test_mm256_cvttpd_epi32() {
3742        let a = _mm256_setr_pd(4., 9., 16., 25.);
3743        let r = _mm256_cvttpd_epi32(a);
3744        let e = _mm_setr_epi32(4, 9, 16, 25);
3745        assert_eq_m128i(r, e);
3746    }
3747
3748    #[simd_test(enable = "avx")]
3749    unsafe fn test_mm256_cvtpd_epi32() {
3750        let a = _mm256_setr_pd(4., 9., 16., 25.);
3751        let r = _mm256_cvtpd_epi32(a);
3752        let e = _mm_setr_epi32(4, 9, 16, 25);
3753        assert_eq_m128i(r, e);
3754    }
3755
3756    #[simd_test(enable = "avx")]
3757    unsafe fn test_mm256_cvttps_epi32() {
3758        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3759        let r = _mm256_cvttps_epi32(a);
3760        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3761        assert_eq_m256i(r, e);
3762    }
3763
3764    #[simd_test(enable = "avx")]
3765    unsafe fn test_mm256_extractf128_ps() {
3766        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3767        let r = _mm256_extractf128_ps::<0>(a);
3768        let e = _mm_setr_ps(4., 3., 2., 5.);
3769        assert_eq_m128(r, e);
3770    }
3771
3772    #[simd_test(enable = "avx")]
3773    unsafe fn test_mm256_extractf128_pd() {
3774        let a = _mm256_setr_pd(4., 3., 2., 5.);
3775        let r = _mm256_extractf128_pd::<0>(a);
3776        let e = _mm_setr_pd(4., 3.);
3777        assert_eq_m128d(r, e);
3778    }
3779
3780    #[simd_test(enable = "avx")]
3781    unsafe fn test_mm256_extractf128_si256() {
3782        let a = _mm256_setr_epi64x(4, 3, 2, 5);
3783        let r = _mm256_extractf128_si256::<0>(a);
3784        let e = _mm_setr_epi64x(4, 3);
3785        assert_eq_m128i(r, e);
3786    }
3787
3788    #[simd_test(enable = "avx")]
3789    unsafe fn test_mm256_extract_epi32() {
3790        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
3791        let r1 = _mm256_extract_epi32::<0>(a);
3792        let r2 = _mm256_extract_epi32::<3>(a);
3793        assert_eq!(r1, -1);
3794        assert_eq!(r2, 3);
3795    }
3796
3797    #[simd_test(enable = "avx")]
3798    unsafe fn test_mm256_cvtsi256_si32() {
3799        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3800        let r = _mm256_cvtsi256_si32(a);
3801        assert_eq!(r, 1);
3802    }
3803
3804    #[simd_test(enable = "avx")]
3805    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3806    unsafe fn test_mm256_zeroall() {
3807        _mm256_zeroall();
3808    }
3809
3810    #[simd_test(enable = "avx")]
3811    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3812    unsafe fn test_mm256_zeroupper() {
3813        _mm256_zeroupper();
3814    }
3815
3816    #[simd_test(enable = "avx")]
3817    unsafe fn test_mm256_permutevar_ps() {
3818        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3819        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3820        let r = _mm256_permutevar_ps(a, b);
3821        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
3822        assert_eq_m256(r, e);
3823    }
3824
3825    #[simd_test(enable = "avx")]
3826    unsafe fn test_mm_permutevar_ps() {
3827        let a = _mm_setr_ps(4., 3., 2., 5.);
3828        let b = _mm_setr_epi32(1, 2, 3, 4);
3829        let r = _mm_permutevar_ps(a, b);
3830        let e = _mm_setr_ps(3., 2., 5., 4.);
3831        assert_eq_m128(r, e);
3832    }
3833
3834    #[simd_test(enable = "avx")]
3835    unsafe fn test_mm256_permute_ps() {
3836        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3837        let r = _mm256_permute_ps::<0x1b>(a);
3838        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
3839        assert_eq_m256(r, e);
3840    }
3841
3842    #[simd_test(enable = "avx")]
3843    unsafe fn test_mm_permute_ps() {
3844        let a = _mm_setr_ps(4., 3., 2., 5.);
3845        let r = _mm_permute_ps::<0x1b>(a);
3846        let e = _mm_setr_ps(5., 2., 3., 4.);
3847        assert_eq_m128(r, e);
3848    }
3849
3850    #[simd_test(enable = "avx")]
3851    unsafe fn test_mm256_permutevar_pd() {
3852        let a = _mm256_setr_pd(4., 3., 2., 5.);
3853        let b = _mm256_setr_epi64x(1, 2, 3, 4);
3854        let r = _mm256_permutevar_pd(a, b);
3855        let e = _mm256_setr_pd(4., 3., 5., 2.);
3856        assert_eq_m256d(r, e);
3857    }
3858
3859    #[simd_test(enable = "avx")]
3860    unsafe fn test_mm_permutevar_pd() {
3861        let a = _mm_setr_pd(4., 3.);
3862        let b = _mm_setr_epi64x(3, 0);
3863        let r = _mm_permutevar_pd(a, b);
3864        let e = _mm_setr_pd(3., 4.);
3865        assert_eq_m128d(r, e);
3866    }
3867
3868    #[simd_test(enable = "avx")]
3869    unsafe fn test_mm256_permute_pd() {
3870        let a = _mm256_setr_pd(4., 3., 2., 5.);
3871        let r = _mm256_permute_pd::<5>(a);
3872        let e = _mm256_setr_pd(3., 4., 5., 2.);
3873        assert_eq_m256d(r, e);
3874    }
3875
3876    #[simd_test(enable = "avx")]
3877    unsafe fn test_mm_permute_pd() {
3878        let a = _mm_setr_pd(4., 3.);
3879        let r = _mm_permute_pd::<1>(a);
3880        let e = _mm_setr_pd(3., 4.);
3881        assert_eq_m128d(r, e);
3882    }
3883
3884    #[simd_test(enable = "avx")]
3885    unsafe fn test_mm256_permute2f128_ps() {
3886        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3887        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3888        let r = _mm256_permute2f128_ps::<0x13>(a, b);
3889        let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
3890        assert_eq_m256(r, e);
3891    }
3892
3893    #[simd_test(enable = "avx")]
3894    unsafe fn test_mm256_permute2f128_pd() {
3895        let a = _mm256_setr_pd(1., 2., 3., 4.);
3896        let b = _mm256_setr_pd(5., 6., 7., 8.);
3897        let r = _mm256_permute2f128_pd::<0x31>(a, b);
3898        let e = _mm256_setr_pd(3., 4., 7., 8.);
3899        assert_eq_m256d(r, e);
3900    }
3901
3902    #[simd_test(enable = "avx")]
3903    unsafe fn test_mm256_permute2f128_si256() {
3904        let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
3905        let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
3906        let r = _mm256_permute2f128_si256::<0x20>(a, b);
3907        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3908        assert_eq_m256i(r, e);
3909    }
3910
3911    #[simd_test(enable = "avx")]
3912    unsafe fn test_mm256_broadcast_ss() {
3913        let r = _mm256_broadcast_ss(&3.);
3914        let e = _mm256_set1_ps(3.);
3915        assert_eq_m256(r, e);
3916    }
3917
3918    #[simd_test(enable = "avx")]
3919    unsafe fn test_mm_broadcast_ss() {
3920        let r = _mm_broadcast_ss(&3.);
3921        let e = _mm_set1_ps(3.);
3922        assert_eq_m128(r, e);
3923    }
3924
3925    #[simd_test(enable = "avx")]
3926    unsafe fn test_mm256_broadcast_sd() {
3927        let r = _mm256_broadcast_sd(&3.);
3928        let e = _mm256_set1_pd(3.);
3929        assert_eq_m256d(r, e);
3930    }
3931
3932    #[simd_test(enable = "avx")]
3933    unsafe fn test_mm256_broadcast_ps() {
3934        let a = _mm_setr_ps(4., 3., 2., 5.);
3935        let r = _mm256_broadcast_ps(&a);
3936        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
3937        assert_eq_m256(r, e);
3938    }
3939
3940    #[simd_test(enable = "avx")]
3941    unsafe fn test_mm256_broadcast_pd() {
3942        let a = _mm_setr_pd(4., 3.);
3943        let r = _mm256_broadcast_pd(&a);
3944        let e = _mm256_setr_pd(4., 3., 4., 3.);
3945        assert_eq_m256d(r, e);
3946    }
3947
3948    #[simd_test(enable = "avx")]
3949    unsafe fn test_mm256_insertf128_ps() {
3950        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3951        let b = _mm_setr_ps(4., 9., 16., 25.);
3952        let r = _mm256_insertf128_ps::<0>(a, b);
3953        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3954        assert_eq_m256(r, e);
3955    }
3956
3957    #[simd_test(enable = "avx")]
3958    unsafe fn test_mm256_insertf128_pd() {
3959        let a = _mm256_setr_pd(1., 2., 3., 4.);
3960        let b = _mm_setr_pd(5., 6.);
3961        let r = _mm256_insertf128_pd::<0>(a, b);
3962        let e = _mm256_setr_pd(5., 6., 3., 4.);
3963        assert_eq_m256d(r, e);
3964    }
3965
3966    #[simd_test(enable = "avx")]
3967    unsafe fn test_mm256_insertf128_si256() {
3968        let a = _mm256_setr_epi64x(1, 2, 3, 4);
3969        let b = _mm_setr_epi64x(5, 6);
3970        let r = _mm256_insertf128_si256::<0>(a, b);
3971        let e = _mm256_setr_epi64x(5, 6, 3, 4);
3972        assert_eq_m256i(r, e);
3973    }
3974
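    // `_mm256_insert_epi8::<N>` replaces the element at index `N`, counted
    // from the least-significant byte, so index 31 is the last element in
    // `setr` order.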
3975    #[simd_test(enable = "avx")]
3976    unsafe fn test_mm256_insert_epi8() {
3977        #[rustfmt::skip]
3978        let a = _mm256_setr_epi8(
3979            1, 2, 3, 4, 5, 6, 7, 8,
3980            9, 10, 11, 12, 13, 14, 15, 16,
3981            17, 18, 19, 20, 21, 22, 23, 24,
3982            25, 26, 27, 28, 29, 30, 31, 32,
3983        );
3984        let r = _mm256_insert_epi8::<31>(a, 0);
3985        #[rustfmt::skip]
3986        let e = _mm256_setr_epi8(
3987            1, 2, 3, 4, 5, 6, 7, 8,
3988            9, 10, 11, 12, 13, 14, 15, 16,
3989            17, 18, 19, 20, 21, 22, 23, 24,
3990            25, 26, 27, 28, 29, 30, 31, 0,
3991        );
3992        assert_eq_m256i(r, e);
3993    }
3994
3995    #[simd_test(enable = "avx")]
3996    unsafe fn test_mm256_insert_epi16() {
3997        #[rustfmt::skip]
3998        let a = _mm256_setr_epi16(
3999            0, 1, 2, 3, 4, 5, 6, 7,
4000            8, 9, 10, 11, 12, 13, 14, 15,
4001        );
4002        let r = _mm256_insert_epi16::<15>(a, 0);
4003        #[rustfmt::skip]
4004        let e = _mm256_setr_epi16(
4005            0, 1, 2, 3, 4, 5, 6, 7,
4006            8, 9, 10, 11, 12, 13, 14, 0,
4007        );
4008        assert_eq_m256i(r, e);
4009    }
4010
4011    #[simd_test(enable = "avx")]
4012    unsafe fn test_mm256_insert_epi32() {
4013        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4014        let r = _mm256_insert_epi32::<7>(a, 0);
4015        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
4016        assert_eq_m256i(r, e);
4017    }
4018
4019    #[simd_test(enable = "avx")]
4020    unsafe fn test_mm256_load_pd() {
4021        let a = _mm256_setr_pd(1., 2., 3., 4.);
4022        let p = ptr::addr_of!(a) as *const f64;
4023        let r = _mm256_load_pd(p);
4024        let e = _mm256_setr_pd(1., 2., 3., 4.);
4025        assert_eq_m256d(r, e);
4026    }
4027
4028    #[simd_test(enable = "avx")]
4029    unsafe fn test_mm256_store_pd() {
4030        let a = _mm256_setr_pd(1., 2., 3., 4.);
4031        let mut r = _mm256_undefined_pd();
4032        _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4033        assert_eq_m256d(r, a);
4034    }
4035
4036    #[simd_test(enable = "avx")]
4037    unsafe fn test_mm256_load_ps() {
4038        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4039        let p = ptr::addr_of!(a) as *const f32;
4040        let r = _mm256_load_ps(p);
4041        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4042        assert_eq_m256(r, e);
4043    }
4044
4045    #[simd_test(enable = "avx")]
4046    unsafe fn test_mm256_store_ps() {
4047        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4048        let mut r = _mm256_undefined_ps();
4049        _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4050        assert_eq_m256(r, a);
4051    }
4052
4053    #[simd_test(enable = "avx")]
4054    unsafe fn test_mm256_loadu_pd() {
4055        let a = &[1.0f64, 2., 3., 4.];
4056        let p = a.as_ptr();
4057        let r = _mm256_loadu_pd(black_box(p));
4058        let e = _mm256_setr_pd(1., 2., 3., 4.);
4059        assert_eq_m256d(r, e);
4060    }
4061
4062    #[simd_test(enable = "avx")]
4063    unsafe fn test_mm256_storeu_pd() {
4064        let a = _mm256_set1_pd(9.);
4065        let mut r = _mm256_undefined_pd();
4066        _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4067        assert_eq_m256d(r, a);
4068    }
4069
4070    #[simd_test(enable = "avx")]
4071    unsafe fn test_mm256_loadu_ps() {
4072        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
4073        let p = a.as_ptr();
4074        let r = _mm256_loadu_ps(black_box(p));
4075        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4076        assert_eq_m256(r, e);
4077    }
4078
4079    #[simd_test(enable = "avx")]
4080    unsafe fn test_mm256_storeu_ps() {
4081        let a = _mm256_set1_ps(9.);
4082        let mut r = _mm256_undefined_ps();
4083        _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4084        assert_eq_m256(r, a);
4085    }
4086
4087    #[simd_test(enable = "avx")]
4088    unsafe fn test_mm256_load_si256() {
4089        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4090        let p = ptr::addr_of!(a);
4091        let r = _mm256_load_si256(p);
4092        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4093        assert_eq_m256i(r, e);
4094    }
4095
4096    #[simd_test(enable = "avx")]
4097    unsafe fn test_mm256_store_si256() {
4098        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4099        let mut r = _mm256_undefined_si256();
4100        _mm256_store_si256(ptr::addr_of_mut!(r), a);
4101        assert_eq_m256i(r, a);
4102    }
4103
4104    #[simd_test(enable = "avx")]
4105    unsafe fn test_mm256_loadu_si256() {
4106        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4107        let p = ptr::addr_of!(a);
4108        let r = _mm256_loadu_si256(black_box(p));
4109        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4110        assert_eq_m256i(r, e);
4111    }
4112
4113    #[simd_test(enable = "avx")]
4114    unsafe fn test_mm256_storeu_si256() {
4115        let a = _mm256_set1_epi8(9);
4116        let mut r = _mm256_undefined_si256();
4117        _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4118        assert_eq_m256i(r, a);
4119    }
4120
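    // Masked loads/stores act only on elements whose mask value has its sign
    // bit set (`!0` here); masked-out lanes are zeroed on load and left
    // untouched on store.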
4121    #[simd_test(enable = "avx")]
4122    unsafe fn test_mm256_maskload_pd() {
4123        let a = &[1.0f64, 2., 3., 4.];
4124        let p = a.as_ptr();
4125        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4126        let r = _mm256_maskload_pd(black_box(p), mask);
4127        let e = _mm256_setr_pd(0., 2., 0., 4.);
4128        assert_eq_m256d(r, e);
4129    }
4130
4131    #[simd_test(enable = "avx")]
4132    unsafe fn test_mm256_maskstore_pd() {
4133        let mut r = _mm256_set1_pd(0.);
4134        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4135        let a = _mm256_setr_pd(1., 2., 3., 4.);
4136        _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4137        let e = _mm256_setr_pd(0., 2., 0., 4.);
4138        assert_eq_m256d(r, e);
4139    }
4140
4141    #[simd_test(enable = "avx")]
4142    unsafe fn test_mm_maskload_pd() {
4143        let a = &[1.0f64, 2.];
4144        let p = a.as_ptr();
4145        let mask = _mm_setr_epi64x(0, !0);
4146        let r = _mm_maskload_pd(black_box(p), mask);
4147        let e = _mm_setr_pd(0., 2.);
4148        assert_eq_m128d(r, e);
4149    }
4150
4151    #[simd_test(enable = "avx")]
4152    unsafe fn test_mm_maskstore_pd() {
4153        let mut r = _mm_set1_pd(0.);
4154        let mask = _mm_setr_epi64x(0, !0);
4155        let a = _mm_setr_pd(1., 2.);
4156        _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4157        let e = _mm_setr_pd(0., 2.);
4158        assert_eq_m128d(r, e);
4159    }
4160
4161    #[simd_test(enable = "avx")]
4162    unsafe fn test_mm256_maskload_ps() {
4163        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4164        let p = a.as_ptr();
4165        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4166        let r = _mm256_maskload_ps(black_box(p), mask);
4167        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4168        assert_eq_m256(r, e);
4169    }
4170
4171    #[simd_test(enable = "avx")]
4172    unsafe fn test_mm256_maskstore_ps() {
4173        let mut r = _mm256_set1_ps(0.);
4174        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4175        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4176        _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4177        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4178        assert_eq_m256(r, e);
4179    }
4180
4181    #[simd_test(enable = "avx")]
4182    unsafe fn test_mm_maskload_ps() {
4183        let a = &[1.0f32, 2., 3., 4.];
4184        let p = a.as_ptr();
4185        let mask = _mm_setr_epi32(0, !0, 0, !0);
4186        let r = _mm_maskload_ps(black_box(p), mask);
4187        let e = _mm_setr_ps(0., 2., 0., 4.);
4188        assert_eq_m128(r, e);
4189    }
4190
4191    #[simd_test(enable = "avx")]
4192    unsafe fn test_mm_maskstore_ps() {
4193        let mut r = _mm_set1_ps(0.);
4194        let mask = _mm_setr_epi32(0, !0, 0, !0);
4195        let a = _mm_setr_ps(1., 2., 3., 4.);
4196        _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4197        let e = _mm_setr_ps(0., 2., 0., 4.);
4198        assert_eq_m128(r, e);
4199    }
4200
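    // `movehdup` duplicates the odd-indexed elements, `moveldup` the
    // even-indexed ones, and `movedup_pd` the low element of each pair.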
4201    #[simd_test(enable = "avx")]
4202    unsafe fn test_mm256_movehdup_ps() {
4203        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4204        let r = _mm256_movehdup_ps(a);
4205        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4206        assert_eq_m256(r, e);
4207    }
4208
4209    #[simd_test(enable = "avx")]
4210    unsafe fn test_mm256_moveldup_ps() {
4211        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4212        let r = _mm256_moveldup_ps(a);
4213        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4214        assert_eq_m256(r, e);
4215    }
4216
4217    #[simd_test(enable = "avx")]
4218    unsafe fn test_mm256_movedup_pd() {
4219        let a = _mm256_setr_pd(1., 2., 3., 4.);
4220        let r = _mm256_movedup_pd(a);
4221        let e = _mm256_setr_pd(1., 1., 3., 3.);
4222        assert_eq_m256d(r, e);
4223    }
4224
4225    #[simd_test(enable = "avx")]
4226    unsafe fn test_mm256_lddqu_si256() {
4227        #[rustfmt::skip]
4228        let a = _mm256_setr_epi8(
4229            1, 2, 3, 4, 5, 6, 7, 8,
4230            9, 10, 11, 12, 13, 14, 15, 16,
4231            17, 18, 19, 20, 21, 22, 23, 24,
4232            25, 26, 27, 28, 29, 30, 31, 32,
4233        );
4234        let p = ptr::addr_of!(a);
4235        let r = _mm256_lddqu_si256(black_box(p));
4236        #[rustfmt::skip]
4237        let e = _mm256_setr_epi8(
4238            1, 2, 3, 4, 5, 6, 7, 8,
4239            9, 10, 11, 12, 13, 14, 15, 16,
4240            17, 18, 19, 20, 21, 22, 23, 24,
4241            25, 26, 27, 28, 29, 30, 31, 32,
4242        );
4243        assert_eq_m256i(r, e);
4244    }
4245
4246    #[simd_test(enable = "avx")]
4247    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4248    unsafe fn test_mm256_stream_si256() {
4249        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4250        let mut r = _mm256_undefined_si256();
4251        _mm256_stream_si256(ptr::addr_of_mut!(r), a);
4252        assert_eq_m256i(r, a);
4253    }
4254
4255    #[simd_test(enable = "avx")]
4256    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4257    unsafe fn test_mm256_stream_pd() {
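        // Non-temporal 256-bit stores require a 32-byte-aligned destination,
        // hence the explicitly aligned wrapper struct below.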
4258        #[repr(align(32))]
4259        struct Memory {
4260            pub data: [f64; 4],
4261        }
4262        let a = _mm256_set1_pd(7.0);
4263        let mut mem = Memory { data: [-1.0; 4] };
4264
4265        _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4266        for i in 0..4 {
4267            assert_eq!(mem.data[i], get_m256d(a, i));
4268        }
4269    }
4270
4271    #[simd_test(enable = "avx")]
4272    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4273    unsafe fn test_mm256_stream_ps() {
4274        #[repr(align(32))]
4275        struct Memory {
4276            pub data: [f32; 8],
4277        }
4278        let a = _mm256_set1_ps(7.0);
4279        let mut mem = Memory { data: [-1.0; 8] };
4280
4281        _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
4282        for i in 0..8 {
4283            assert_eq!(mem.data[i], get_m256(a, i));
4284        }
4285    }
4286
4287    #[simd_test(enable = "avx")]
4288    unsafe fn test_mm256_rcp_ps() {
4289        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4290        let r = _mm256_rcp_ps(a);
4291        #[rustfmt::skip]
4292        let e = _mm256_setr_ps(
4293            0.99975586, 0.49987793, 0.33325195, 0.24993896,
4294            0.19995117, 0.16662598, 0.14282227, 0.12496948,
4295        );
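        // `vrcpps` only approximates the reciprocal (Intel documents a maximum
        // relative error of 1.5 * 2^-12), so compare with a relative tolerance.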
4296        let rel_err = 0.00048828125;
4297        for i in 0..8 {
4298            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4299        }
4300    }
4301
4302    #[simd_test(enable = "avx")]
4303    unsafe fn test_mm256_rsqrt_ps() {
4304        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4305        let r = _mm256_rsqrt_ps(a);
4306        #[rustfmt::skip]
4307        let e = _mm256_setr_ps(
4308            0.99975586, 0.7069092, 0.5772705, 0.49987793,
4309            0.44714355, 0.40820313, 0.3779297, 0.3534546,
4310        );
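        // `vrsqrtps` is likewise an approximation, so use the same tolerance.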
4311        let rel_err = 0.00048828125;
4312        for i in 0..8 {
4313            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4314        }
4315    }
4316
4317    #[simd_test(enable = "avx")]
4318    unsafe fn test_mm256_unpackhi_pd() {
4319        let a = _mm256_setr_pd(1., 2., 3., 4.);
4320        let b = _mm256_setr_pd(5., 6., 7., 8.);
4321        let r = _mm256_unpackhi_pd(a, b);
4322        let e = _mm256_setr_pd(2., 6., 4., 8.);
4323        assert_eq_m256d(r, e);
4324    }
4325
4326    #[simd_test(enable = "avx")]
4327    unsafe fn test_mm256_unpackhi_ps() {
4328        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4329        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4330        let r = _mm256_unpackhi_ps(a, b);
4331        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
4332        assert_eq_m256(r, e);
4333    }
4334
4335    #[simd_test(enable = "avx")]
4336    unsafe fn test_mm256_unpacklo_pd() {
4337        let a = _mm256_setr_pd(1., 2., 3., 4.);
4338        let b = _mm256_setr_pd(5., 6., 7., 8.);
4339        let r = _mm256_unpacklo_pd(a, b);
4340        let e = _mm256_setr_pd(1., 5., 3., 7.);
4341        assert_eq_m256d(r, e);
4342    }
4343
4344    #[simd_test(enable = "avx")]
4345    unsafe fn test_mm256_unpacklo_ps() {
4346        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4347        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4348        let r = _mm256_unpacklo_ps(a, b);
4349        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
4350        assert_eq_m256(r, e);
4351    }
4352
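    // `vptest`-based tests: `testz` returns 1 when `a & b` is all zeros,
    // `testc` returns 1 when `!a & b` is all zeros, and `testnzc` returns 1
    // only when neither of those conditions holds.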
4353    #[simd_test(enable = "avx")]
4354    unsafe fn test_mm256_testz_si256() {
4355        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4356        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4357        let r = _mm256_testz_si256(a, b);
4358        assert_eq!(r, 0);
4359        let b = _mm256_set1_epi64x(0);
4360        let r = _mm256_testz_si256(a, b);
4361        assert_eq!(r, 1);
4362    }
4363
4364    #[simd_test(enable = "avx")]
4365    unsafe fn test_mm256_testc_si256() {
4366        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4367        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4368        let r = _mm256_testc_si256(a, b);
4369        assert_eq!(r, 0);
4370        let b = _mm256_set1_epi64x(0);
4371        let r = _mm256_testc_si256(a, b);
4372        assert_eq!(r, 1);
4373    }
4374
4375    #[simd_test(enable = "avx")]
4376    unsafe fn test_mm256_testnzc_si256() {
4377        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4378        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4379        let r = _mm256_testnzc_si256(a, b);
4380        assert_eq!(r, 1);
4381        let a = _mm256_setr_epi64x(0, 0, 0, 0);
4382        let b = _mm256_setr_epi64x(0, 0, 0, 0);
4383        let r = _mm256_testnzc_si256(a, b);
4384        assert_eq!(r, 0);
4385    }
4386
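    // The `_pd`/`_ps` variants of `vtestpd`/`vtestps` inspect only the sign
    // bits of the elements, so all-positive inputs behave like zero vectors.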
4387    #[simd_test(enable = "avx")]
4388    unsafe fn test_mm256_testz_pd() {
4389        let a = _mm256_setr_pd(1., 2., 3., 4.);
4390        let b = _mm256_setr_pd(5., 6., 7., 8.);
4391        let r = _mm256_testz_pd(a, b);
4392        assert_eq!(r, 1);
4393        let a = _mm256_set1_pd(-1.);
4394        let r = _mm256_testz_pd(a, a);
4395        assert_eq!(r, 0);
4396    }
4397
4398    #[simd_test(enable = "avx")]
4399    unsafe fn test_mm256_testc_pd() {
4400        let a = _mm256_setr_pd(1., 2., 3., 4.);
4401        let b = _mm256_setr_pd(5., 6., 7., 8.);
4402        let r = _mm256_testc_pd(a, b);
4403        assert_eq!(r, 1);
4404        let a = _mm256_set1_pd(1.);
4405        let b = _mm256_set1_pd(-1.);
4406        let r = _mm256_testc_pd(a, b);
4407        assert_eq!(r, 0);
4408    }
4409
4410    #[simd_test(enable = "avx")]
4411    unsafe fn test_mm256_testnzc_pd() {
4412        let a = _mm256_setr_pd(1., 2., 3., 4.);
4413        let b = _mm256_setr_pd(5., 6., 7., 8.);
4414        let r = _mm256_testnzc_pd(a, b);
4415        assert_eq!(r, 0);
4416        let a = _mm256_setr_pd(1., -1., -1., -1.);
4417        let b = _mm256_setr_pd(-1., -1., 1., 1.);
4418        let r = _mm256_testnzc_pd(a, b);
4419        assert_eq!(r, 1);
4420    }
4421
4422    #[simd_test(enable = "avx")]
4423    unsafe fn test_mm_testz_pd() {
4424        let a = _mm_setr_pd(1., 2.);
4425        let b = _mm_setr_pd(5., 6.);
4426        let r = _mm_testz_pd(a, b);
4427        assert_eq!(r, 1);
4428        let a = _mm_set1_pd(-1.);
4429        let r = _mm_testz_pd(a, a);
4430        assert_eq!(r, 0);
4431    }
4432
4433    #[simd_test(enable = "avx")]
4434    unsafe fn test_mm_testc_pd() {
4435        let a = _mm_setr_pd(1., 2.);
4436        let b = _mm_setr_pd(5., 6.);
4437        let r = _mm_testc_pd(a, b);
4438        assert_eq!(r, 1);
4439        let a = _mm_set1_pd(1.);
4440        let b = _mm_set1_pd(-1.);
4441        let r = _mm_testc_pd(a, b);
4442        assert_eq!(r, 0);
4443    }
4444
4445    #[simd_test(enable = "avx")]
4446    unsafe fn test_mm_testnzc_pd() {
4447        let a = _mm_setr_pd(1., 2.);
4448        let b = _mm_setr_pd(5., 6.);
4449        let r = _mm_testnzc_pd(a, b);
4450        assert_eq!(r, 0);
4451        let a = _mm_setr_pd(1., -1.);
4452        let b = _mm_setr_pd(-1., -1.);
4453        let r = _mm_testnzc_pd(a, b);
4454        assert_eq!(r, 1);
4455    }
4456
4457    #[simd_test(enable = "avx")]
4458    unsafe fn test_mm256_testz_ps() {
4459        let a = _mm256_set1_ps(1.);
4460        let r = _mm256_testz_ps(a, a);
4461        assert_eq!(r, 1);
4462        let a = _mm256_set1_ps(-1.);
4463        let r = _mm256_testz_ps(a, a);
4464        assert_eq!(r, 0);
4465    }
4466
4467    #[simd_test(enable = "avx")]
4468    unsafe fn test_mm256_testc_ps() {
4469        let a = _mm256_set1_ps(1.);
4470        let r = _mm256_testc_ps(a, a);
4471        assert_eq!(r, 1);
4472        let b = _mm256_set1_ps(-1.);
4473        let r = _mm256_testc_ps(a, b);
4474        assert_eq!(r, 0);
4475    }
4476
4477    #[simd_test(enable = "avx")]
4478    unsafe fn test_mm256_testnzc_ps() {
4479        let a = _mm256_set1_ps(1.);
4480        let r = _mm256_testnzc_ps(a, a);
4481        assert_eq!(r, 0);
4482        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
4483        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
4484        let r = _mm256_testnzc_ps(a, b);
4485        assert_eq!(r, 1);
4486    }
4487
4488    #[simd_test(enable = "avx")]
4489    unsafe fn test_mm_testz_ps() {
4490        let a = _mm_set1_ps(1.);
4491        let r = _mm_testz_ps(a, a);
4492        assert_eq!(r, 1);
4493        let a = _mm_set1_ps(-1.);
4494        let r = _mm_testz_ps(a, a);
4495        assert_eq!(r, 0);
4496    }
4497
4498    #[simd_test(enable = "avx")]
4499    unsafe fn test_mm_testc_ps() {
4500        let a = _mm_set1_ps(1.);
4501        let r = _mm_testc_ps(a, a);
4502        assert_eq!(r, 1);
4503        let b = _mm_set1_ps(-1.);
4504        let r = _mm_testc_ps(a, b);
4505        assert_eq!(r, 0);
4506    }
4507
4508    #[simd_test(enable = "avx")]
4509    unsafe fn test_mm_testnzc_ps() {
4510        let a = _mm_set1_ps(1.);
4511        let r = _mm_testnzc_ps(a, a);
4512        assert_eq!(r, 0);
4513        let a = _mm_setr_ps(1., -1., -1., -1.);
4514        let b = _mm_setr_ps(-1., -1., 1., 1.);
4515        let r = _mm_testnzc_ps(a, b);
4516        assert_eq!(r, 1);
4517    }
4518
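    // `movemask` packs the sign bit of each element into the low bits of the
    // result: here the negative lanes 1 and 3 give 0b1010 = 0xA.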
4519    #[simd_test(enable = "avx")]
4520    unsafe fn test_mm256_movemask_pd() {
4521        let a = _mm256_setr_pd(1., -2., 3., -4.);
4522        let r = _mm256_movemask_pd(a);
4523        assert_eq!(r, 0xA);
4524    }
4525
4526    #[simd_test(enable = "avx")]
4527    unsafe fn test_mm256_movemask_ps() {
4528        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
4529        let r = _mm256_movemask_ps(a);
4530        assert_eq!(r, 0xAA);
4531    }
4532
4533    #[simd_test(enable = "avx")]
4534    unsafe fn test_mm256_setzero_pd() {
4535        let r = _mm256_setzero_pd();
4536        assert_eq_m256d(r, _mm256_set1_pd(0.));
4537    }
4538
4539    #[simd_test(enable = "avx")]
4540    unsafe fn test_mm256_setzero_ps() {
4541        let r = _mm256_setzero_ps();
4542        assert_eq_m256(r, _mm256_set1_ps(0.));
4543    }
4544
4545    #[simd_test(enable = "avx")]
4546    unsafe fn test_mm256_setzero_si256() {
4547        let r = _mm256_setzero_si256();
4548        assert_eq_m256i(r, _mm256_set1_epi8(0));
4549    }
4550
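    // `_mm256_set_*` lists arguments from the most-significant element down,
    // so the result equals the reversed `_mm256_setr_*` sequence.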
4551    #[simd_test(enable = "avx")]
4552    unsafe fn test_mm256_set_pd() {
4553        let r = _mm256_set_pd(1., 2., 3., 4.);
4554        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4555    }
4556
4557    #[simd_test(enable = "avx")]
4558    unsafe fn test_mm256_set_ps() {
4559        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4560        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
4561    }
4562
4563    #[simd_test(enable = "avx")]
4564    unsafe fn test_mm256_set_epi8() {
4565        #[rustfmt::skip]
4566        let r = _mm256_set_epi8(
4567            1, 2, 3, 4, 5, 6, 7, 8,
4568            9, 10, 11, 12, 13, 14, 15, 16,
4569            17, 18, 19, 20, 21, 22, 23, 24,
4570            25, 26, 27, 28, 29, 30, 31, 32,
4571        );
4572        #[rustfmt::skip]
4573        let e = _mm256_setr_epi8(
4574            32, 31, 30, 29, 28, 27, 26, 25,
4575            24, 23, 22, 21, 20, 19, 18, 17,
4576            16, 15, 14, 13, 12, 11, 10, 9,
4577            8, 7, 6, 5, 4, 3, 2, 1
4578        );
4579        assert_eq_m256i(r, e);
4580    }
4581
4582    #[simd_test(enable = "avx")]
4583    unsafe fn test_mm256_set_epi16() {
4584        #[rustfmt::skip]
4585        let r = _mm256_set_epi16(
4586            1, 2, 3, 4, 5, 6, 7, 8,
4587            9, 10, 11, 12, 13, 14, 15, 16,
4588        );
4589        #[rustfmt::skip]
4590        let e = _mm256_setr_epi16(
4591            16, 15, 14, 13, 12, 11, 10, 9,
4592            8, 7, 6, 5, 4, 3, 2, 1,
4593        );
4594        assert_eq_m256i(r, e);
4595    }
4596
4597    #[simd_test(enable = "avx")]
4598    unsafe fn test_mm256_set_epi32() {
4599        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4600        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4601    }
4602
4603    #[simd_test(enable = "avx")]
4604    unsafe fn test_mm256_set_epi64x() {
4605        let r = _mm256_set_epi64x(1, 2, 3, 4);
4606        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4607    }
4608
4609    #[simd_test(enable = "avx")]
4610    unsafe fn test_mm256_setr_pd() {
4611        let r = _mm256_setr_pd(1., 2., 3., 4.);
4612        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4613    }
4614
4615    #[simd_test(enable = "avx")]
4616    unsafe fn test_mm256_setr_ps() {
4617        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4618        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
4619    }
4620
4621    #[simd_test(enable = "avx")]
4622    unsafe fn test_mm256_setr_epi8() {
4623        #[rustfmt::skip]
4624        let r = _mm256_setr_epi8(
4625            1, 2, 3, 4, 5, 6, 7, 8,
4626            9, 10, 11, 12, 13, 14, 15, 16,
4627            17, 18, 19, 20, 21, 22, 23, 24,
4628            25, 26, 27, 28, 29, 30, 31, 32,
4629        );
4630        #[rustfmt::skip]
4631        let e = _mm256_setr_epi8(
4632            1, 2, 3, 4, 5, 6, 7, 8,
4633            9, 10, 11, 12, 13, 14, 15, 16,
4634            17, 18, 19, 20, 21, 22, 23, 24,
4635            25, 26, 27, 28, 29, 30, 31, 32
4636        );
4637
4638        assert_eq_m256i(r, e);
4639    }
4640
4641    #[simd_test(enable = "avx")]
4642    unsafe fn test_mm256_setr_epi16() {
4643        #[rustfmt::skip]
4644        let r = _mm256_setr_epi16(
4645            1, 2, 3, 4, 5, 6, 7, 8,
4646            9, 10, 11, 12, 13, 14, 15, 16,
4647        );
4648        #[rustfmt::skip]
4649        let e = _mm256_setr_epi16(
4650            1, 2, 3, 4, 5, 6, 7, 8,
4651            9, 10, 11, 12, 13, 14, 15, 16,
4652        );
4653        assert_eq_m256i(r, e);
4654    }
4655
4656    #[simd_test(enable = "avx")]
4657    unsafe fn test_mm256_setr_epi32() {
4658        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4659        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4660    }
4661
4662    #[simd_test(enable = "avx")]
4663    unsafe fn test_mm256_setr_epi64x() {
4664        let r = _mm256_setr_epi64x(1, 2, 3, 4);
4665        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4666    }
4667
4668    #[simd_test(enable = "avx")]
4669    unsafe fn test_mm256_set1_pd() {
4670        let r = _mm256_set1_pd(1.);
4671        assert_eq_m256d(r, _mm256_set1_pd(1.));
4672    }
4673
4674    #[simd_test(enable = "avx")]
4675    unsafe fn test_mm256_set1_ps() {
4676        let r = _mm256_set1_ps(1.);
4677        assert_eq_m256(r, _mm256_set1_ps(1.));
4678    }
4679
4680    #[simd_test(enable = "avx")]
4681    unsafe fn test_mm256_set1_epi8() {
4682        let r = _mm256_set1_epi8(1);
4683        assert_eq_m256i(r, _mm256_set1_epi8(1));
4684    }
4685
4686    #[simd_test(enable = "avx")]
4687    unsafe fn test_mm256_set1_epi16() {
4688        let r = _mm256_set1_epi16(1);
4689        assert_eq_m256i(r, _mm256_set1_epi16(1));
4690    }
4691
4692    #[simd_test(enable = "avx")]
4693    unsafe fn test_mm256_set1_epi32() {
4694        let r = _mm256_set1_epi32(1);
4695        assert_eq_m256i(r, _mm256_set1_epi32(1));
4696    }
4697
4698    #[simd_test(enable = "avx")]
4699    unsafe fn test_mm256_set1_epi64x() {
4700        let r = _mm256_set1_epi64x(1);
4701        assert_eq_m256i(r, _mm256_set1_epi64x(1));
4702    }
4703
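    // The `cast` intrinsics reinterpret bits rather than convert values:
    // 1.0f64 (0x3FF0_0000_0000_0000) reads back as the two f32s (0.0, 1.875).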
4704    #[simd_test(enable = "avx")]
4705    unsafe fn test_mm256_castpd_ps() {
4706        let a = _mm256_setr_pd(1., 2., 3., 4.);
4707        let r = _mm256_castpd_ps(a);
4708        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4709        assert_eq_m256(r, e);
4710    }
4711
4712    #[simd_test(enable = "avx")]
4713    unsafe fn test_mm256_castps_pd() {
4714        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4715        let r = _mm256_castps_pd(a);
4716        let e = _mm256_setr_pd(1., 2., 3., 4.);
4717        assert_eq_m256d(r, e);
4718    }
4719
4720    #[simd_test(enable = "avx")]
4721    unsafe fn test_mm256_castps_si256() {
4722        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4723        let r = _mm256_castps_si256(a);
4724        #[rustfmt::skip]
4725        let e = _mm256_setr_epi8(
4726            0, 0, -128, 63, 0, 0, 0, 64,
4727            0, 0, 64, 64, 0, 0, -128, 64,
4728            0, 0, -96, 64, 0, 0, -64, 64,
4729            0, 0, -32, 64, 0, 0, 0, 65,
4730        );
4731        assert_eq_m256i(r, e);
4732    }
4733
4734    #[simd_test(enable = "avx")]
4735    unsafe fn test_mm256_castsi256_ps() {
4736        #[rustfmt::skip]
4737        let a = _mm256_setr_epi8(
4738            0, 0, -128, 63, 0, 0, 0, 64,
4739            0, 0, 64, 64, 0, 0, -128, 64,
4740            0, 0, -96, 64, 0, 0, -64, 64,
4741            0, 0, -32, 64, 0, 0, 0, 65,
4742        );
4743        let r = _mm256_castsi256_ps(a);
4744        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4745        assert_eq_m256(r, e);
4746    }
4747
4748    #[simd_test(enable = "avx")]
4749    unsafe fn test_mm256_castpd_si256() {
4750        let a = _mm256_setr_pd(1., 2., 3., 4.);
4751        let r = _mm256_castpd_si256(a);
4752        assert_eq_m256d(transmute(r), a);
4753    }
4754
4755    #[simd_test(enable = "avx")]
4756    unsafe fn test_mm256_castsi256_pd() {
4757        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4758        let r = _mm256_castsi256_pd(a);
4759        assert_eq_m256d(r, transmute(a));
4760    }
4761
4762    #[simd_test(enable = "avx")]
4763    unsafe fn test_mm256_castps256_ps128() {
4764        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4765        let r = _mm256_castps256_ps128(a);
4766        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
4767    }
4768
4769    #[simd_test(enable = "avx")]
4770    unsafe fn test_mm256_castpd256_pd128() {
4771        let a = _mm256_setr_pd(1., 2., 3., 4.);
4772        let r = _mm256_castpd256_pd128(a);
4773        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
4774    }
4775
4776    #[simd_test(enable = "avx")]
4777    unsafe fn test_mm256_castsi256_si128() {
4778        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4779        let r = _mm256_castsi256_si128(a);
4780        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
4781    }
4782
4783    #[simd_test(enable = "avx")]
4784    unsafe fn test_mm256_castps128_ps256() {
4785        let a = _mm_setr_ps(1., 2., 3., 4.);
4786        let r = _mm256_castps128_ps256(a);
4787        assert_eq_m128(_mm256_castps256_ps128(r), a);
4788    }
4789
4790    #[simd_test(enable = "avx")]
4791    unsafe fn test_mm256_castpd128_pd256() {
4792        let a = _mm_setr_pd(1., 2.);
4793        let r = _mm256_castpd128_pd256(a);
4794        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
4795    }
4796
4797    #[simd_test(enable = "avx")]
4798    unsafe fn test_mm256_castsi128_si256() {
4799        let a = _mm_setr_epi32(1, 2, 3, 4);
4800        let r = _mm256_castsi128_si256(a);
4801        assert_eq_m128i(_mm256_castsi256_si128(r), a);
4802    }
4803
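    // Unlike the plain 128-to-256-bit casts, whose upper half is undefined,
    // the `zext` intrinsics guarantee that the upper 128 bits are zeroed.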
4804    #[simd_test(enable = "avx")]
4805    unsafe fn test_mm256_zextps128_ps256() {
4806        let a = _mm_setr_ps(1., 2., 3., 4.);
4807        let r = _mm256_zextps128_ps256(a);
4808        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
4809        assert_eq_m256(r, e);
4810    }
4811
4812    #[simd_test(enable = "avx")]
4813    unsafe fn test_mm256_zextsi128_si256() {
4814        let a = _mm_setr_epi64x(1, 2);
4815        let r = _mm256_zextsi128_si256(a);
4816        let e = _mm256_setr_epi64x(1, 2, 0, 0);
4817        assert_eq_m256i(r, e);
4818    }
4819
4820    #[simd_test(enable = "avx")]
4821    unsafe fn test_mm256_zextpd128_pd256() {
4822        let a = _mm_setr_pd(1., 2.);
4823        let r = _mm256_zextpd128_pd256(a);
4824        let e = _mm256_setr_pd(1., 2., 0., 0.);
4825        assert_eq_m256d(r, e);
4826    }
4827
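    // `_mm256_set_m128*` takes `(hi, lo)` and places `lo` in the low half,
    // mirroring the most-significant-first argument order of `_mm256_set_*`.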
4828    #[simd_test(enable = "avx")]
4829    unsafe fn test_mm256_set_m128() {
4830        let hi = _mm_setr_ps(5., 6., 7., 8.);
4831        let lo = _mm_setr_ps(1., 2., 3., 4.);
4832        let r = _mm256_set_m128(hi, lo);
4833        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4834        assert_eq_m256(r, e);
4835    }
4836
4837    #[simd_test(enable = "avx")]
4838    unsafe fn test_mm256_set_m128d() {
4839        let hi = _mm_setr_pd(3., 4.);
4840        let lo = _mm_setr_pd(1., 2.);
4841        let r = _mm256_set_m128d(hi, lo);
4842        let e = _mm256_setr_pd(1., 2., 3., 4.);
4843        assert_eq_m256d(r, e);
4844    }
4845
4846    #[simd_test(enable = "avx")]
4847    unsafe fn test_mm256_set_m128i() {
4848        #[rustfmt::skip]
4849        let hi = _mm_setr_epi8(
4850            17, 18, 19, 20,
4851            21, 22, 23, 24,
4852            25, 26, 27, 28,
4853            29, 30, 31, 32,
4854        );
4855        #[rustfmt::skip]
4856        let lo = _mm_setr_epi8(
4857            1, 2, 3, 4,
4858            5, 6, 7, 8,
4859            9, 10, 11, 12,
4860            13, 14, 15, 16,
4861        );
4862        let r = _mm256_set_m128i(hi, lo);
4863        #[rustfmt::skip]
4864        let e = _mm256_setr_epi8(
4865            1, 2, 3, 4, 5, 6, 7, 8,
4866            9, 10, 11, 12, 13, 14, 15, 16,
4867            17, 18, 19, 20, 21, 22, 23, 24,
4868            25, 26, 27, 28, 29, 30, 31, 32,
4869        );
4870        assert_eq_m256i(r, e);
4871    }
4872
4873    #[simd_test(enable = "avx")]
4874    unsafe fn test_mm256_setr_m128() {
4875        let lo = _mm_setr_ps(1., 2., 3., 4.);
4876        let hi = _mm_setr_ps(5., 6., 7., 8.);
4877        let r = _mm256_setr_m128(lo, hi);
4878        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4879        assert_eq_m256(r, e);
4880    }
4881
4882    #[simd_test(enable = "avx")]
4883    unsafe fn test_mm256_setr_m128d() {
4884        let lo = _mm_setr_pd(1., 2.);
4885        let hi = _mm_setr_pd(3., 4.);
4886        let r = _mm256_setr_m128d(lo, hi);
4887        let e = _mm256_setr_pd(1., 2., 3., 4.);
4888        assert_eq_m256d(r, e);
4889    }
4890
4891    #[simd_test(enable = "avx")]
4892    unsafe fn test_mm256_setr_m128i() {
4893        #[rustfmt::skip]
4894        let lo = _mm_setr_epi8(
4895            1, 2, 3, 4,
4896            5, 6, 7, 8,
4897            9, 10, 11, 12,
4898            13, 14, 15, 16,
4899        );
4900        #[rustfmt::skip]
4901        let hi = _mm_setr_epi8(
4902            17, 18, 19, 20, 21, 22, 23, 24,
4903            25, 26, 27, 28, 29, 30, 31, 32,
4904        );
4905        let r = _mm256_setr_m128i(lo, hi);
4906        #[rustfmt::skip]
4907        let e = _mm256_setr_epi8(
4908            1, 2, 3, 4, 5, 6, 7, 8,
4909            9, 10, 11, 12, 13, 14, 15, 16,
4910            17, 18, 19, 20, 21, 22, 23, 24,
4911            25, 26, 27, 28, 29, 30, 31, 32,
4912        );
4913        assert_eq_m256i(r, e);
4914    }
4915
4916    #[simd_test(enable = "avx")]
4917    unsafe fn test_mm256_loadu2_m128() {
4918        let hi = &[5., 6., 7., 8.];
4919        let hiaddr = hi.as_ptr();
4920        let lo = &[1., 2., 3., 4.];
4921        let loaddr = lo.as_ptr();
4922        let r = _mm256_loadu2_m128(hiaddr, loaddr);
4923        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4924        assert_eq_m256(r, e);
4925    }
4926
4927    #[simd_test(enable = "avx")]
4928    unsafe fn test_mm256_loadu2_m128d() {
4929        let hi = &[3., 4.];
4930        let hiaddr = hi.as_ptr();
4931        let lo = &[1., 2.];
4932        let loaddr = lo.as_ptr();
4933        let r = _mm256_loadu2_m128d(hiaddr, loaddr);
4934        let e = _mm256_setr_pd(1., 2., 3., 4.);
4935        assert_eq_m256d(r, e);
4936    }
4937
4938    #[simd_test(enable = "avx")]
4939    unsafe fn test_mm256_loadu2_m128i() {
4940        #[rustfmt::skip]
4941        let hi = _mm_setr_epi8(
4942            17, 18, 19, 20, 21, 22, 23, 24,
4943            25, 26, 27, 28, 29, 30, 31, 32,
4944        );
4945        #[rustfmt::skip]
4946        let lo = _mm_setr_epi8(
4947            1, 2, 3, 4, 5, 6, 7, 8,
4948            9, 10, 11, 12, 13, 14, 15, 16,
4949        );
4950        let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
4951        #[rustfmt::skip]
4952        let e = _mm256_setr_epi8(
4953            1, 2, 3, 4, 5, 6, 7, 8,
4954            9, 10, 11, 12, 13, 14, 15, 16,
4955            17, 18, 19, 20, 21, 22, 23, 24,
4956            25, 26, 27, 28, 29, 30, 31, 32,
4957        );
4958        assert_eq_m256i(r, e);
4959    }
4960
4961    #[simd_test(enable = "avx")]
4962    unsafe fn test_mm256_storeu2_m128() {
4963        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4964        let mut hi = _mm_undefined_ps();
4965        let mut lo = _mm_undefined_ps();
4966        _mm256_storeu2_m128(
4967            ptr::addr_of_mut!(hi) as *mut f32,
4968            ptr::addr_of_mut!(lo) as *mut f32,
4969            a,
4970        );
4971        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
4972        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
4973    }
4974
4975    #[simd_test(enable = "avx")]
4976    unsafe fn test_mm256_storeu2_m128d() {
4977        let a = _mm256_setr_pd(1., 2., 3., 4.);
4978        let mut hi = _mm_undefined_pd();
4979        let mut lo = _mm_undefined_pd();
4980        _mm256_storeu2_m128d(
4981            ptr::addr_of_mut!(hi) as *mut f64,
4982            ptr::addr_of_mut!(lo) as *mut f64,
4983            a,
4984        );
4985        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
4986        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
4987    }
4988
4989    #[simd_test(enable = "avx")]
4990    unsafe fn test_mm256_storeu2_m128i() {
4991        #[rustfmt::skip]
4992        let a = _mm256_setr_epi8(
4993            1, 2, 3, 4, 5, 6, 7, 8,
4994            9, 10, 11, 12, 13, 14, 15, 16,
4995            17, 18, 19, 20, 21, 22, 23, 24,
4996            25, 26, 27, 28, 29, 30, 31, 32,
4997        );
4998        let mut hi = _mm_undefined_si128();
4999        let mut lo = _mm_undefined_si128();
5000        _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
5001        #[rustfmt::skip]
5002        let e_hi = _mm_setr_epi8(
5003            17, 18, 19, 20, 21, 22, 23, 24,
5004            25, 26, 27, 28, 29, 30, 31, 32
5005        );
5006        #[rustfmt::skip]
5007        let e_lo = _mm_setr_epi8(
5008            1, 2, 3, 4, 5, 6, 7, 8,
5009            9, 10, 11, 12, 13, 14, 15, 16
5010        );
5011
5012        assert_eq_m128i(hi, e_hi);
5013        assert_eq_m128i(lo, e_lo);
5014    }
5015
5016    #[simd_test(enable = "avx")]
5017    unsafe fn test_mm256_cvtss_f32() {
5018        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5019        let r = _mm256_cvtss_f32(a);
5020        assert_eq!(r, 1.);
5021    }
5022}