// core/stdarch/crates/core_arch/src/x86/avx.rs

//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
15
16use crate::{
17    core_arch::{simd::*, x86::*},
18    intrinsics::simd::*,
19    mem, ptr,
20};
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise addition; codegen is verified to emit a single `vaddpd`.
    unsafe { simd_add(a, b) }
}
36
/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise addition; codegen is verified to emit a single `vaddps`.
    unsafe { simd_add(a, b) }
}
48
/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// `vandp` (truncated) accepts both the `vandps` and `vandpd` forms LLVM may
// pick; see https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    // Reinterpret as integer lanes so the AND is a pure bit operation,
    // then reinterpret back; no floating-point semantics are involved.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(a, b))
    }
}
65
66/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
67/// elements in `a` and `b`.
68///
69/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
70#[inline]
71#[target_feature(enable = "avx")]
72#[cfg_attr(test, assert_instr(vandps))]
73#[stable(feature = "simd_x86", since = "1.27.0")]
74pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
75    unsafe {
76        let a: u32x8 = transmute(a);
77        let b: u32x8 = transmute(b);
78        transmute(simd_and(a, b))
79    }
80}
81
82/// Computes the bitwise OR packed double-precision (64-bit) floating-point
83/// elements in `a` and `b`.
84///
85/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
86#[inline]
87#[target_feature(enable = "avx")]
88// See <https://github.com/rust-lang/stdarch/issues/71>.
89#[cfg_attr(test, assert_instr(vorp))]
90#[stable(feature = "simd_x86", since = "1.27.0")]
91pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
92    unsafe {
93        let a: u64x4 = transmute(a);
94        let b: u64x4 = transmute(b);
95        transmute(simd_or(a, b))
96    }
97}
98
/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    // Pure bitwise OR on the reinterpreted integer lanes.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_or(a, b))
    }
}
114
/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    // Shuffle indices are into the 8-element concatenation [a, b]:
    // +4 selects from `b`, +2/+6 select the high 128-bit lane. Mask bit 0
    // picks the low-lane element from `a`, bit 1 the low-lane element from
    // `b`, bits 2/3 do the same for the high lane — matching `vshufpd`.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}
139
/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    // Indices are into the 16-element concatenation [a, b]: +8 selects from
    // `b`, +4/+12 address the high 128-bit lane. The same 2-bit selectors
    // are applied to both lanes, as `vshufps` does.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}
168
/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    // NOT is expressed as XOR with all-ones so the whole computation stays
    // in the portable simd intrinsics; computes `(!a) & b` bitwise.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
    }
}
184
/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`
/// and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    // Computes `(!a) & b` bitwise; NOT is XOR with all-ones.
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
    }
}
201
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    // Delegates to the `vmaxpd` intrinsic rather than a portable max.
    // NOTE(review): x86 max is not IEEE-754 `maxNum` — behavior for NaN and
    // ±0.0 operands follows the instruction; consult the Intel SDM if exact
    // NaN handling matters.
    unsafe { vmaxpd(a, b) }
}
213
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    // Delegates to `vmaxps`; NaN/±0.0 handling follows the instruction,
    // not IEEE `maxNum` — see the note on `_mm256_max_pd`-style semantics.
    unsafe { vmaxps(a, b) }
}
225
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    // Delegates to `vminpd`; NaN/±0.0 handling follows the instruction,
    // not IEEE `minNum`.
    unsafe { vminpd(a, b) }
}
237
/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    // Delegates to `vminps`; NaN/±0.0 handling follows the instruction,
    // not IEEE `minNum`.
    unsafe { vminps(a, b) }
}
249
/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise multiplication; codegen is verified to emit `vmulpd`.
    unsafe { simd_mul(a, b) }
}
261
/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise multiplication; codegen is verified to emit `vmulps`.
    unsafe { simd_mul(a, b) }
}
273
/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Compute both a+b and a-b, then interleave: shuffle indices >= 4 pick
    // from `sub`, so even lanes come from a-b and odd lanes from a+b,
    // matching `vaddsubpd`.
    unsafe {
        let a = a.as_f64x4();
        let b = b.as_f64x4();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}
291
/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    // Compute both a+b and a-b, then interleave: shuffle indices >= 8 pick
    // from `sub`, so even lanes come from a-b and odd lanes from a+b,
    // matching `vaddsubps`.
    unsafe {
        let a = a.as_f32x8();
        let b = b.as_f32x8();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}
309
/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise subtraction (a - b); codegen is verified to emit `vsubpd`.
    unsafe { simd_sub(a, b) }
}
321
/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise subtraction (a - b); codegen is verified to emit `vsubps`.
    unsafe { simd_sub(a, b) }
}
333
/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    // Lane-wise division (a / b); codegen is verified to emit `vdivps`.
    unsafe { simd_div(a, b) }
}
345
/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    // Lane-wise division (a / b); codegen is verified to emit `vdivpd`.
    unsafe { simd_div(a, b) }
}
357
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    // `ROUNDING` is restricted to 4 bits and forwarded as the immediate
    // operand of `vroundpd`.
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd256(a, ROUNDING) }
}
380
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    // Portable ceil intrinsic; lowers to `vroundpd` with the round-up mode.
    unsafe { simd_ceil(a) }
}
392
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_pd(a: __m256d) -> __m256d {
    // Portable floor intrinsic; lowers to `vroundpd` with the round-down mode.
    unsafe { simd_floor(a) }
}
404
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    // `ROUNDING` is restricted to 4 bits and forwarded as the immediate
    // operand of `vroundps`.
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps256(a, ROUNDING) }
}
427
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_ps(a: __m256) -> __m256 {
    // Portable ceil intrinsic; lowers to `vroundps` with the round-up mode.
    unsafe { simd_ceil(a) }
}
439
/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_ps(a: __m256) -> __m256 {
    // Portable floor intrinsic; lowers to `vroundps` with the round-down mode.
    unsafe { simd_floor(a) }
}
451
/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    // Lane-wise square root; codegen is verified to emit `vsqrtps`.
    unsafe { simd_fsqrt(a) }
}
463
/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    // Lane-wise square root; codegen is verified to emit `vsqrtpd`.
    unsafe { simd_fsqrt(a) }
}
475
/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    // For each lane i: if bit i of IMM4 is set, the `* 4` offset moves the
    // shuffle index into `b`'s half of the concatenated [a, b]; otherwise
    // lane i is taken from `a`.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}
503
/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // For each lane i: if bit i of IMM8 is set, the `* 8` offset moves the
    // shuffle index into `b`'s half of the concatenated [a, b]; otherwise
    // lane i is taken from `a`.
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}
532
/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe {
        // `simd_lt(c, 0)` on the integer view produces an all-ones lane
        // exactly where `c`'s sign bit is set — matching `vblendvpd`, which
        // selects `b` where the control's sign bit is 1, else `a`.
        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
    }
}
547
/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe {
        // All-ones lane where `c`'s sign bit is set; `simd_select` then picks
        // `b` for those lanes and `a` elsewhere, matching `vblendvps`.
        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
    }
}
562
/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sum the four products, and conditionally return the sum
///  using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // The 8-bit control is narrowed to i8 for the `vdpps` intrinsic's
    // immediate operand (value-preserving for the asserted 8-bit range).
    unsafe { vdpps(a, b, IMM8 as i8) }
}
578
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    // Gather the even-indexed and odd-indexed members of each adjacent pair
    // (interleaving a and b), then add them element-wise — equivalent to
    // `vhaddpd`'s pairwise sums.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_add(even, odd)
    }
}
596
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    // Gather the even-indexed and odd-indexed members of each adjacent pair
    // (lane-interleaving a and b), then add them element-wise — equivalent
    // to `vhaddps`'s pairwise sums.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd)
    }
}
615
/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, differences of pairs from `a` are returned in even
/// locations, while differences of pairs from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    // Same interleave as hadd, but subtracting the odd member of each pair
    // from the even one — equivalent to `vhsubpd`.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_sub(even, odd)
    }
}
633
/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, differences of pairs from `a` are returned in locations of
/// indices 0, 1, 4, 5; while differences of pairs from `b` are in locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    // Same interleave as hadd, but subtracting the odd member of each pair
    // from the even one — equivalent to `vhsubps`.
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd)
    }
}
652
/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
// Truncated `vxorp` as LLVM may emit either `vxorps` or `vxorpd`;
// see <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    // Pure bitwise XOR on the reinterpreted integer lanes.
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_xor(a, b))
    }
}
668
669/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
670/// elements in `a` and `b`.
671///
672/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
673#[inline]
674#[target_feature(enable = "avx")]
675#[cfg_attr(test, assert_instr(vxorps))]
676#[stable(feature = "simd_x86", since = "1.27.0")]
677pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
678    unsafe {
679        let a: u32x8 = transmute(a);
680        let b: u32x8 = transmute(b);
681        transmute(simd_xor(a, b))
682    }
683}
684
// Comparison predicate constants for the 5-bit immediate of the
// `_mm*_cmp_*` intrinsics below. Naming follows Intel's convention:
// `O`/`U` mark ordered/unordered predicates and `S`/`Q` mark
// signaling/quiet variants, per the predicate descriptions below.
/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;
781
/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is one of the `_CMP_*` predicate constants defined in this module.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    // The `const` block guarantees the predicate is a compile-time immediate,
    // as the underlying instruction encodes it in the opcode.
    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
}
796
797/// Compares packed double-precision (64-bit) floating-point
798/// elements in `a` and `b` based on the comparison operand
799/// specified by `IMM5`.
800///
801/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
802#[inline]
803#[target_feature(enable = "avx")]
804#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
805#[rustc_legacy_const_generics(2)]
806#[stable(feature = "simd_x86", since = "1.27.0")]
807pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
808    static_assert_uimm_bits!(IMM5, 5);
809    unsafe { vcmppd256(a, b, IMM5 as u8) }
810}
811
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is one of the `_CMP_*` predicate constants defined in this module.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    // The `const` block guarantees the predicate is a compile-time immediate,
    // as the underlying instruction encodes it in the opcode.
    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
}
826
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// `IMM5` is one of the `_CMP_*` predicate constants defined in this module.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    // The `const` block guarantees the predicate is a compile-time immediate,
    // as the underlying instruction encodes it in the opcode.
    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
}
841
842/// Compares the lower double-precision (64-bit) floating-point element in
843/// `a` and `b` based on the comparison operand specified by `IMM5`,
844/// store the result in the lower element of returned vector,
845/// and copies the upper element from `a` to the upper element of returned
846/// vector.
847///
848/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
849#[inline]
850#[target_feature(enable = "avx")]
851#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
852#[rustc_legacy_const_generics(2)]
853#[stable(feature = "simd_x86", since = "1.27.0")]
854pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
855    static_assert_uimm_bits!(IMM5, 5);
856    unsafe { vcmpsd(a, b, IMM5 as i8) }
857}
858
859/// Compares the lower single-precision (32-bit) floating-point element in
860/// `a` and `b` based on the comparison operand specified by `IMM5`,
861/// store the result in the lower element of returned vector,
862/// and copies the upper 3 packed elements from `a` to the upper elements of
863/// returned vector.
864///
865/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
866#[inline]
867#[target_feature(enable = "avx")]
868#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
869#[rustc_legacy_const_generics(2)]
870#[stable(feature = "simd_x86", since = "1.27.0")]
871pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
872    static_assert_uimm_bits!(IMM5, 5);
873    unsafe { vcmpss(a, b, IMM5 as i8) }
874}
875
/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    // Element-wise i32 -> f64 widening: 4 lanes in, 4 lanes out. Every i32 is
    // exactly representable as f64, so the conversion is lossless.
    unsafe { simd_cast(a.as_i32x4()) }
}
887
/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    // Element-wise i32 -> f32 conversion across all 8 lanes; large magnitudes
    // may round since f32 has only 24 bits of mantissa.
    unsafe { simd_cast(a.as_i32x8()) }
}
899
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    // Narrowing f64 -> f32 conversion: 4 doubles become 4 floats in a
    // 128-bit result.
    unsafe { simd_cast(a) }
}
911
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    // Rounding conversion (`vcvtps2dq`); see `_mm256_cvttps_epi32` below for
    // the truncating variant.
    unsafe { transmute(vcvtps2dq(a)) }
}
923
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    // Widening f32 -> f64 conversion; every f32 value is exactly
    // representable as f64, so this is lossless.
    unsafe { simd_cast(a) }
}
935
/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    // Extract lane 0 (the lowest-addressed double); no conversion happens.
    unsafe { simd_extract!(a, 0) }
}
946
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    // Truncating variant (`vcvttpd2dq`, note the extra 't'): fractional
    // parts are discarded rather than rounded.
    unsafe { transmute(vcvttpd2dq(a)) }
}
958
/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    // Rounding conversion (`vcvtpd2dq`); see `_mm256_cvttpd_epi32` above for
    // the truncating variant.
    unsafe { transmute(vcvtpd2dq(a)) }
}
970
/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    // Truncating variant (`vcvttps2dq`, note the extra 't'): fractional
    // parts are discarded rather than rounded.
    unsafe { transmute(vcvttps2dq(a)) }
}
982
/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Index table: IMM1 == 0 picks lanes 0..=3 (low half), IMM1 == 1
        // picks lanes 4..=7 (high half). The second shuffle operand is never
        // actually read.
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
        )
    }
}
1002
/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    // IMM1 == 0 picks lanes 0..=1 (low half), IMM1 == 1 picks lanes 2..=3
    // (high half); the second shuffle operand is never actually read.
    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
}
1016
/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // View `a` as four i64 lanes: IMM1 == 0 picks lanes 0..=1 (low half),
        // IMM1 == 1 picks lanes 2..=3 (high half). The zero operand only
        // exists to satisfy the shuffle's two-input shape; it is never read.
        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
        transmute(dst)
    }
}
1032
/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    // INDEX must fit in 3 bits (0..=7) since the vector has eight i32 lanes.
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
}
1045
/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    // Extract lane 0 of the i32x8 view; no conversion happens.
    unsafe { simd_extract!(a.as_i32x8(), 0) }
}
1055
/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    // Pure register side effect; takes no arguments and returns nothing.
    unsafe { vzeroall() }
}
1066
/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128-bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    // Commonly used before transitioning from AVX to legacy-SSE code to
    // avoid AVX/SSE transition penalties (see Intel's documentation above).
    unsafe { vzeroupper() }
}
1078
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    // Runtime-variable shuffle control, unlike `_mm256_permute_ps` which
    // takes a compile-time immediate.
    unsafe { vpermilps256(a, b.as_i32x8()) }
}
1090
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    // 128-bit counterpart of `_mm256_permutevar_ps`; control is a runtime
    // vector rather than an immediate.
    unsafe { vpermilps(a, b.as_i32x4()) }
}
1102
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Each 2-bit field of IMM8 selects one of the 4 elements within a
        // 128-bit lane; the same pattern is applied to both lanes, with `+ 4`
        // addressing the high lane. The second operand is never read.
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                ((IMM8 as u32 >> 0) & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}
1131
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Each 2-bit field of IMM8 selects one of the 4 source elements for
        // the corresponding output position. The second operand is never read.
        simd_shuffle!(
            a,
            _mm_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}
1156
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    // `vpermilpd` selects within each 128-bit lane (the doc previously said
    // 256-bit lanes, which does not match Intel's description of vpermilpd).
    unsafe { vpermilpd256(a, b.as_i64x4()) }
}
1168
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    // 128-bit counterpart of `_mm256_permutevar_pd`; control is a runtime
    // vector rather than an immediate.
    unsafe { vpermilpd(a, b.as_i64x2()) }
}
1180
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        // One control bit per output element: bits 0-1 select within the low
        // 128-bit lane, bits 2-3 within the high lane (`+ 2` offsets into the
        // high lane). The second operand is never read.
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                ((IMM4 as u32 >> 0) & 1),
                ((IMM4 as u32 >> 1) & 1),
                ((IMM4 as u32 >> 2) & 1) + 2,
                ((IMM4 as u32 >> 3) & 1) + 2,
            ],
        )
    }
}
1205
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        // One control bit per output element selects which of the two source
        // doubles lands there. The second operand is never read.
        simd_shuffle!(
            a,
            _mm_undefined_pd(),
            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
        )
    }
}
1225
/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    // The operation is element-type agnostic (it moves whole 128-bit halves),
    // so delegate to the si256 implementation via bit-preserving casts.
    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castps_si256(a),
        _mm256_castps_si256(b),
    ))
}
1242
/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    // The operation is element-type agnostic (it moves whole 128-bit halves),
    // so delegate to the si256 implementation via bit-preserving casts.
    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castpd_si256(a),
        _mm256_castpd_si256(b),
    ))
}
1259
/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // First pass: map output 64-bit lane `pos` onto the concatenated [a, b]
    // input. The low nibble of `imm8` controls the low 128-bit half of the
    // result, the high nibble the high half; bits 0-1 of the controlling
    // nibble select one of the four 128-bit halves of [a, b].
    const fn idx(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        2 * (part as u32 & 0b11) + (pos & 1)
    }
    // Second pass: if bit 3 of the controlling nibble is set, that 128-bit
    // half of the result is zeroed — index 4 selects from the `i64x4::ZERO`
    // operand below.
    const fn idx0(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        if part & 0b1000 != 0 { 4 } else { pos }
    }
    unsafe {
        let r = simd_shuffle!(
            a.as_i64x4(),
            b.as_i64x4(),
            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
        );
        let r: i64x4 = simd_shuffle!(
            r,
            i64x4::ZERO,
            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
        );
        r.as_m256i()
    }
}
1301
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    // Takes `&f32` (not `*const f32`) so the function can stay safe; the
    // splat itself compiles to `vbroadcastss`.
    _mm256_set1_ps(*f)
}
1314
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub fn _mm_broadcast_ss(f: &f32) -> __m128 {
    // Takes `&f32` (not `*const f32`) so the function can stay safe; the
    // splat itself compiles to `vbroadcastss`.
    _mm_set1_ps(*f)
}
1327
/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    // Takes `&f64` (not `*const f64`) so the function can stay safe; the
    // splat itself compiles to `vbroadcastsd`.
    _mm256_set1_pd(*f)
}
1340
/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    // Duplicate the four source lanes into both 128-bit halves; the zero
    // operand is only there to satisfy the shuffle's shape and is never read.
    unsafe { simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) }
}
1352
/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    // Duplicate the two source lanes into both 128-bit halves; the zero
    // operand is only there to satisfy the shuffle's shape and is never read.
    unsafe { simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) }
}
1364
/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Shuffle indices 0..=7 address `a`, 8..=15 address the widened `b`
        // (whose payload is lanes 8..=11). IMM1 selects which half of `a` is
        // replaced by `b`.
        simd_shuffle!(
            a,
            _mm256_castps128_ps256(b),
            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
        )
    }
}
1385
/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Shuffle indices 0..=3 address `a`, 4..=7 address the widened `b`
        // (whose payload is lanes 4..=5). IMM1 selects which half of `a` is
        // replaced by `b`.
        simd_shuffle!(
            a,
            _mm256_castpd128_pd256(b),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        )
    }
}
1406
/// Copies `a` to result, then inserts 128 bits from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        // Same index scheme as `_mm256_insertf128_pd`, operating on the
        // i64x4 view: indices 0..=3 address `a`, 4..=5 the widened `b`.
        let dst: i64x4 = simd_shuffle!(
            a.as_i64x4(),
            _mm256_castsi128_si256(b).as_i64x4(),
            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
        );
        transmute(dst)
    }
}
1427
/// Copies `a` to result, and inserts the 8-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
    // INDEX must fit in 5 bits (0..=31) since the vector has 32 i8 lanes.
    static_assert_uimm_bits!(INDEX, 5);
    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
}
1441
/// Copies `a` to result, and inserts the 16-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
    // INDEX must fit in 4 bits (0..=15) since the vector has 16 i16 lanes.
    static_assert_uimm_bits!(INDEX, 4);
    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
}
1455
/// Copies `a` to result, and inserts the 32-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
    // INDEX must fit in 3 bits (0..=7) since the vector has 8 i32 lanes.
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
}
1469
/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// # Safety
///
/// `mem_addr` must be valid for reads of 32 bytes and aligned to 32 bytes:
/// the body performs an aligned `__m256d` dereference.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
    *(mem_addr as *const __m256d)
}
1487
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// # Safety
///
/// `mem_addr` must be valid for writes of 32 bytes and aligned to 32 bytes:
/// the body performs an aligned `__m256d` store through the pointer.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovap)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
    *(mem_addr as *mut __m256d) = a;
}
1505
/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// # Safety
///
/// `mem_addr` must be valid for reads of 32 bytes and aligned to 32 bytes:
/// the body performs an aligned `__m256` dereference.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
    *(mem_addr as *const __m256)
}
1523
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
    // Aligned store: plain assignment through the pointer cast to the
    // vector type; caller must guarantee 32-byte alignment.
    *(mem_addr as *mut __m256) = a;
}
1541
/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
    // Unaligned load: a byte-wise copy into a local vector imposes no
    // alignment requirement on `mem_addr`.
    let mut dst = _mm256_undefined_pd();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256d>(),
    );
    dst
}
1560
/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
    // `write_unaligned` permits any address, matching the intrinsic's
    // no-alignment contract.
    mem_addr.cast::<__m256d>().write_unaligned(a);
}
1573
/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
    // Unaligned load: a byte-wise copy into a local vector imposes no
    // alignment requirement on `mem_addr`.
    let mut dst = _mm256_undefined_ps();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256>(),
    );
    dst
}
1592
/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
    // `write_unaligned` permits any address, matching the intrinsic's
    // no-alignment contract.
    mem_addr.cast::<__m256>().write_unaligned(a);
}
1605
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
    // Aligned load: the pointer is already the vector type, so a plain
    // dereference suffices; caller must guarantee 32-byte alignment.
    *mem_addr
}
1621
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` must be aligned on a 32-byte boundary or a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(vmovaps)
)] // FIXME vmovdqa expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
    // Aligned store via plain assignment; caller must guarantee
    // 32-byte alignment.
    *mem_addr = a;
}
1637
/// Loads 256-bits of integer data from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
    // Unaligned load: a byte-wise copy into a local vector imposes no
    // alignment requirement on `mem_addr`.
    let mut dst = _mm256_undefined_si256();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m256i>(),
    );
    dst
}
1655
/// Stores 256-bits of integer data from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
    // `write_unaligned` permits any address, matching the intrinsic's
    // no-alignment contract.
    mem_addr.write_unaligned(a);
}
1667
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
    // Arithmetic shift right by 63 broadcasts each lane's sign bit across
    // the whole 64-bit lane, producing the all-ones/all-zeros mask that
    // the masked-load expects. Masked-off lanes take the zero fallback.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_pd())
}
1681
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
    // Broadcast each lane's sign bit across the 64-bit lane (arithmetic
    // shift by 63); only lanes with the high bit set are written.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1694
/// Loads packed double-precision (64-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
    // 128-bit variant: sign-extend each lane's high bit into a full-lane
    // mask; masked-off lanes take the zero fallback.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd())
}
1708
/// Stores packed double-precision (64-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
    // 128-bit variant: sign-extend each lane's high bit into a full-lane
    // mask; only lanes with the high bit set are written.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1721
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
    // Arithmetic shift right by 31 broadcasts each lane's sign bit across
    // the 32-bit lane; masked-off lanes take the zero fallback.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps())
}
1735
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
    // Broadcast each lane's sign bit across the 32-bit lane (arithmetic
    // shift by 31); only lanes with the high bit set are written.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1748
/// Loads packed single-precision (32-bit) floating-point elements from memory
/// into result using `mask` (elements are zeroed out when the high bit of the
/// corresponding element is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
    // 128-bit variant: sign-extend each lane's high bit into a full-lane
    // mask; masked-off lanes take the zero fallback.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps())
}
1762
/// Stores packed single-precision (32-bit) floating-point elements from `a`
/// into memory using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaskmovps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
    // 128-bit variant: sign-extend each lane's high bit into a full-lane
    // mask; only lanes with the high bit set are written.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a)
}
1775
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_movehdup_ps(a: __m256) -> __m256 {
    // Each odd lane is copied into the even lane below it:
    // result = [a1, a1, a3, a3, a5, a5, a7, a7].
    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
}
1787
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_moveldup_ps(a: __m256) -> __m256 {
    // Each even lane is copied into the odd lane above it:
    // result = [a0, a0, a2, a2, a4, a4, a6, a6].
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
}
1799
/// Duplicate even-indexed double-precision (64-bit) floating-point elements
/// from `a`, and returns the results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_movedup_pd(a: __m256d) -> __m256d {
    // result = [a0, a0, a2, a2]: the even lane of each 128-bit half
    // is duplicated.
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
}
1811
/// Loads 256-bits of integer data from unaligned memory into result.
/// This intrinsic may perform better than `_mm256_loadu_si256` when the
/// data crosses a cache line boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vlddqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
    // Goes through the dedicated `vlddqu` LLVM intrinsic (rather than a
    // plain unaligned load) so the distinct instruction is emitted.
    transmute(vlddqu(mem_addr as *const i8))
}
1824
/// Moves integer data from a 256-bit integer vector to a 32-byte
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Inline asm guarantees the non-temporal `vmovntdq` is actually emitted;
    // the `vps!` macro expands to the "mnemonic ptr, reg" operand form.
    crate::arch::asm!(
        vps!("vmovntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1852
/// Moves double-precision values from a 256-bit vector of `[4 x double]`
/// to a 32-byte aligned memory location. To minimize caching, the data is
/// flagged as non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Inline asm guarantees the non-temporal `vmovntpd` is actually emitted;
    // the `vps!` macro expands to the "mnemonic ptr, reg" operand form.
    crate::arch::asm!(
        vps!("vmovntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1881
/// Moves single-precision floating point values from a 256-bit vector
/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
/// caching, the data is flagged as non-temporal (unlikely to be used again
/// soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    // Inline asm guarantees the non-temporal `vmovntps` is actually emitted;
    // the `vps!` macro expands to the "mnemonic ptr, reg" operand form.
    crate::arch::asm!(
        vps!("vmovntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(ymm_reg) a,
        options(nostack, preserves_flags),
    );
}
1911
/// Computes the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`, and returns the results. The maximum
/// relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
    // Thin wrapper over the `vrcpps` LLVM intrinsic shim.
    unsafe { vrcpps(a) }
}
1924
/// Computes the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`, and returns the results.
/// The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vrsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
    // Thin wrapper over the `vrsqrtps` LLVM intrinsic shim.
    unsafe { vrsqrtps(a) }
}
1937
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices >= 4 select from `b`: result = [a1, b1, a3, b3].
    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
}
1949
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the high half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
    // Indices >= 8 select from `b`:
    // result = [a2, b2, a3, b3, a6, b6, a7, b7].
    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
}
1961
/// Unpacks and interleave double-precision (64-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
    // Indices >= 4 select from `b`: result = [a0, b0, a2, b2].
    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
}
1973
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the low half of each 128-bit lane in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
    // Indices >= 8 select from `b`:
    // result = [a0, b0, a1, b1, a4, b4, a5, b5].
    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
}
1985
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // ZF semantics in scalar form: 1 iff (a AND b) is all zeros.
        let r = simd_and(a.as_i64x4(), b.as_i64x4());
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2002
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
    unsafe {
        // CF semantics in scalar form: 1 iff ((NOT a) AND b) is all zeros.
        // XOR with all-ones computes the bitwise NOT of `a`.
        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2019
/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
/// `CF` values are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
    // Delegates to the `ptest` LLVM intrinsic shim for the combined
    // "not zero and not carry" result.
    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
}
2034
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the `vtestpd` LLVM intrinsic shim (ZF result).
    unsafe { vtestzpd256(a, b) }
}
2051
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the `vtestpd` LLVM intrinsic shim (CF result).
    unsafe { vtestcpd256(a, b) }
}
2068
/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
    // Delegates to the `vtestpd` LLVM intrinsic shim (combined result).
    unsafe { vtestnzcpd256(a, b) }
}
2086
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // `simd_lt(..., 0)` is true exactly when a lane's sign bit is set,
        // so the reduce-or is nonzero iff some sign bit of (a AND b) is set.
        // ZF = 1 means no sign bit set.
        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2106
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
    unsafe {
        // CF inspects ((NOT a) AND b): `simd_lt(..., 0)` extracts each
        // lane's sign bit, and CF = 1 iff no sign bit is set.
        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
        (0i64 == simd_reduce_or(r)) as i32
    }
}
2126
/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
    // Delegates to the `vtestpd` LLVM intrinsic shim (combined result).
    unsafe { vtestnzcpd(a, b) }
}
2144
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
    // Delegates to the `vtestps` LLVM intrinsic shim (ZF result).
    unsafe { vtestzps256(a, b) }
}
2161
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
    // Delegates to the `vtestps` LLVM intrinsic shim (CF result).
    unsafe { vtestcps256(a, b) }
}
2178
/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
    // Delegates to the `vtestps` LLVM intrinsic shim (combined result).
    unsafe { vtestnzcps256(a, b) }
}
2196
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // Viewed as i32, a lane is negative exactly when its sign bit is set,
        // so `simd_lt(.., 0)` yields an all-ones mask for each such lane.
        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
        // ZF = 1 iff no lane of `a & b` had its sign bit set.
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2216
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return the `CF` value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
    unsafe {
        // `_mm_andnot_ps(a, b)` computes `!a & b`; a lane compares < 0
        // (as i32) exactly when its sign bit is set.
        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
        // CF = 1 iff no lane of `!a & b` had its sign bit set.
        (0i32 == simd_reduce_or(r)) as i32
    }
}
2236
/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
/// are zero, otherwise return 0.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vtestps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
    // Thin wrapper over the 128-bit `vtestps`-backed compiler intrinsic.
    unsafe { vtestnzcps(a, b) }
}
2254
/// Sets each bit of the returned mask based on the most significant bit of the
/// corresponding packed double-precision (64-bit) floating-point element in
/// `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_movemask_pd(a: __m256d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
        // Pack the four per-lane sign bits into the low 4 bits of the result.
        simd_bitmask::<i64x4, u8>(mask).into()
    }
}
2272
/// Sets each bit of the returned mask based on the most significant bit of the
/// corresponding packed single-precision (32-bit) floating-point element in
/// `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmovmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_movemask_ps(a: __m256) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
        // Pack the eight per-lane sign bits into the low 8 bits of the result.
        simd_bitmask::<i32x8, u8>(mask).into()
    }
}
2290
/// Returns vector of type __m256d with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setzero_pd() -> __m256d {
    // All-zero bits are a valid value for this vector type, so `mem::zeroed`
    // is sound; the `const` block folds it into a compile-time constant.
    const { unsafe { mem::zeroed() } }
}
2301
/// Returns vector of type __m256 with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setzero_ps() -> __m256 {
    // All-zero bits are a valid value for this vector type, so `mem::zeroed`
    // is sound; the `const` block folds it into a compile-time constant.
    const { unsafe { mem::zeroed() } }
}
2312
/// Returns vector of type __m256i with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxor))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setzero_si256() -> __m256i {
    // All-zero bits are a valid value for this vector type, so `mem::zeroed`
    // is sound; the `const` block folds it into a compile-time constant.
    const { unsafe { mem::zeroed() } }
}
2323
2324/// Sets packed double-precision (64-bit) floating-point elements in returned
2325/// vector with the supplied values.
2326///
2327/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
2328#[inline]
2329#[target_feature(enable = "avx")]
2330// This intrinsic has no corresponding instruction.
2331#[cfg_attr(test, assert_instr(vinsertf128))]
2332#[stable(feature = "simd_x86", since = "1.27.0")]
2333pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2334    _mm256_setr_pd(d, c, b, a)
2335}
2336
2337/// Sets packed single-precision (32-bit) floating-point elements in returned
2338/// vector with the supplied values.
2339///
2340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2341#[inline]
2342#[target_feature(enable = "avx")]
2343// This intrinsic has no corresponding instruction.
2344#[stable(feature = "simd_x86", since = "1.27.0")]
2345pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
2346    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2347}
2348
/// Sets packed 8-bit integers in returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_set_epi8(
    e00: i8,
    e01: i8,
    e02: i8,
    e03: i8,
    e04: i8,
    e05: i8,
    e06: i8,
    e07: i8,
    e08: i8,
    e09: i8,
    e10: i8,
    e11: i8,
    e12: i8,
    e13: i8,
    e14: i8,
    e15: i8,
    e16: i8,
    e17: i8,
    e18: i8,
    e19: i8,
    e20: i8,
    e21: i8,
    e22: i8,
    e23: i8,
    e24: i8,
    e25: i8,
    e26: i8,
    e27: i8,
    e28: i8,
    e29: i8,
    e30: i8,
    e31: i8,
) -> __m256i {
    // Arguments are given highest-lane first; forward to the `setr` variant
    // with the order reversed so `e00` ends up in the highest lane.
    #[rustfmt::skip]
    _mm256_setr_epi8(
        e31, e30, e29, e28, e27, e26, e25, e24,
        e23, e22, e21, e20, e19, e18, e17, e16,
        e15, e14, e13, e12, e11, e10, e09, e08,
        e07, e06, e05, e04, e03, e02, e01, e00,
    )
}
2398
/// Sets packed 16-bit integers in returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_set_epi16(
    e00: i16,
    e01: i16,
    e02: i16,
    e03: i16,
    e04: i16,
    e05: i16,
    e06: i16,
    e07: i16,
    e08: i16,
    e09: i16,
    e10: i16,
    e11: i16,
    e12: i16,
    e13: i16,
    e14: i16,
    e15: i16,
) -> __m256i {
    // Arguments are given highest-lane first; forward to the `setr` variant
    // with the order reversed so `e00` ends up in the highest lane.
    #[rustfmt::skip]
    _mm256_setr_epi16(
        e15, e14, e13, e12,
        e11, e10, e09, e08,
        e07, e06, e05, e04,
        e03, e02, e01, e00,
    )
}
2432
2433/// Sets packed 32-bit integers in returned vector with the supplied values.
2434///
2435/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2436#[inline]
2437#[target_feature(enable = "avx")]
2438// This intrinsic has no corresponding instruction.
2439#[stable(feature = "simd_x86", since = "1.27.0")]
2440pub fn _mm256_set_epi32(
2441    e0: i32,
2442    e1: i32,
2443    e2: i32,
2444    e3: i32,
2445    e4: i32,
2446    e5: i32,
2447    e6: i32,
2448    e7: i32,
2449) -> __m256i {
2450    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2451}
2452
2453/// Sets packed 64-bit integers in returned vector with the supplied values.
2454///
2455/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2456#[inline]
2457#[target_feature(enable = "avx")]
2458// This intrinsic has no corresponding instruction.
2459#[stable(feature = "simd_x86", since = "1.27.0")]
2460pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2461    _mm256_setr_epi64x(d, c, b, a)
2462}
2463
/// Sets packed double-precision (64-bit) floating-point elements in returned
/// vector with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
    // Lanes are filled in ascending index order: `a` is lane 0 (lowest).
    __m256d([a, b, c, d])
}
2475
/// Sets packed single-precision (32-bit) floating-point elements in returned
/// vector with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
    // Lanes are filled in ascending index order: `a` is lane 0 (lowest).
    __m256([a, b, c, d, e, f, g, h])
}
2487
/// Sets packed 8-bit integers in returned vector with the supplied values in
/// reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_epi8(
    e00: i8,
    e01: i8,
    e02: i8,
    e03: i8,
    e04: i8,
    e05: i8,
    e06: i8,
    e07: i8,
    e08: i8,
    e09: i8,
    e10: i8,
    e11: i8,
    e12: i8,
    e13: i8,
    e14: i8,
    e15: i8,
    e16: i8,
    e17: i8,
    e18: i8,
    e19: i8,
    e20: i8,
    e21: i8,
    e22: i8,
    e23: i8,
    e24: i8,
    e25: i8,
    e26: i8,
    e27: i8,
    e28: i8,
    e29: i8,
    e30: i8,
    e31: i8,
) -> __m256i {
    unsafe {
        // `i8x32::new` takes lanes in index order, so `e00` is lane 0
        // (lowest), matching the `setr` convention.
        #[rustfmt::skip]
        transmute(i8x32::new(
            e00, e01, e02, e03, e04, e05, e06, e07,
            e08, e09, e10, e11, e12, e13, e14, e15,
            e16, e17, e18, e19, e20, e21, e22, e23,
            e24, e25, e26, e27, e28, e29, e30, e31,
        ))
    }
}
2540
/// Sets packed 16-bit integers in returned vector with the supplied values in
/// reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_epi16(
    e00: i16,
    e01: i16,
    e02: i16,
    e03: i16,
    e04: i16,
    e05: i16,
    e06: i16,
    e07: i16,
    e08: i16,
    e09: i16,
    e10: i16,
    e11: i16,
    e12: i16,
    e13: i16,
    e14: i16,
    e15: i16,
) -> __m256i {
    unsafe {
        // `i16x16::new` takes lanes in index order, so `e00` is lane 0
        // (lowest), matching the `setr` convention.
        #[rustfmt::skip]
        transmute(i16x16::new(
            e00, e01, e02, e03,
            e04, e05, e06, e07,
            e08, e09, e10, e11,
            e12, e13, e14, e15,
        ))
    }
}
2577
/// Sets packed 32-bit integers in returned vector with the supplied values in
/// reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_epi32(
    e0: i32,
    e1: i32,
    e2: i32,
    e3: i32,
    e4: i32,
    e5: i32,
    e6: i32,
    e7: i32,
) -> __m256i {
    // `i32x8::new` takes lanes in index order (`e0` is lane 0, the lowest).
    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}
2598
/// Sets packed 64-bit integers in returned vector with the supplied values in
/// reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
    // `i64x4::new` takes lanes in index order (`a` is lane 0, the lowest).
    unsafe { transmute(i64x4::new(a, b, c, d)) }
}
2610
2611/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2612/// elements of returned vector.
2613///
2614/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2615#[inline]
2616#[target_feature(enable = "avx")]
2617// This intrinsic has no corresponding instruction.
2618#[stable(feature = "simd_x86", since = "1.27.0")]
2619pub fn _mm256_set1_pd(a: f64) -> __m256d {
2620    _mm256_setr_pd(a, a, a, a)
2621}
2622
2623/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2624/// elements of returned vector.
2625///
2626/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2627#[inline]
2628#[target_feature(enable = "avx")]
2629// This intrinsic has no corresponding instruction.
2630#[stable(feature = "simd_x86", since = "1.27.0")]
2631pub fn _mm256_set1_ps(a: f32) -> __m256 {
2632    _mm256_setr_ps(a, a, a, a, a, a, a, a)
2633}
2634
2635/// Broadcasts 8-bit integer `a` to all elements of returned vector.
2636/// This intrinsic may generate the `vpbroadcastb`.
2637///
2638/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2639#[inline]
2640#[target_feature(enable = "avx")]
2641// This intrinsic has no corresponding instruction.
2642#[stable(feature = "simd_x86", since = "1.27.0")]
2643pub fn _mm256_set1_epi8(a: i8) -> __m256i {
2644    #[rustfmt::skip]
2645    _mm256_setr_epi8(
2646        a, a, a, a, a, a, a, a,
2647        a, a, a, a, a, a, a, a,
2648        a, a, a, a, a, a, a, a,
2649        a, a, a, a, a, a, a, a,
2650    )
2651}
2652
2653/// Broadcasts 16-bit integer `a` to all elements of returned vector.
2654/// This intrinsic may generate the `vpbroadcastw`.
2655///
2656/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2657#[inline]
2658#[target_feature(enable = "avx")]
2659//#[cfg_attr(test, assert_instr(vpshufb))]
2660#[cfg_attr(test, assert_instr(vinsertf128))]
2661// This intrinsic has no corresponding instruction.
2662#[stable(feature = "simd_x86", since = "1.27.0")]
2663pub fn _mm256_set1_epi16(a: i16) -> __m256i {
2664    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
2665}
2666
2667/// Broadcasts 32-bit integer `a` to all elements of returned vector.
2668/// This intrinsic may generate the `vpbroadcastd`.
2669///
2670/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
2671#[inline]
2672#[target_feature(enable = "avx")]
2673// This intrinsic has no corresponding instruction.
2674#[stable(feature = "simd_x86", since = "1.27.0")]
2675pub fn _mm256_set1_epi32(a: i32) -> __m256i {
2676    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
2677}
2678
2679/// Broadcasts 64-bit integer `a` to all elements of returned vector.
2680/// This intrinsic may generate the `vpbroadcastq`.
2681///
2682/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
2683#[inline]
2684#[target_feature(enable = "avx")]
2685#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2686#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
2687// This intrinsic has no corresponding instruction.
2688#[stable(feature = "simd_x86", since = "1.27.0")]
2689pub fn _mm256_set1_epi64x(a: i64) -> __m256i {
2690    _mm256_setr_epi64x(a, a, a, a)
2691}
2692
/// Cast vector of type __m256d to type __m256.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castpd_ps(a: __m256d) -> __m256 {
    // Bitwise reinterpretation; both types are 256 bits wide.
    unsafe { transmute(a) }
}
2704
/// Cast vector of type __m256 to type __m256d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castps_pd(a: __m256) -> __m256d {
    // Bitwise reinterpretation; both types are 256 bits wide.
    unsafe { transmute(a) }
}
2716
/// Casts vector of type __m256 to type __m256i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castps_si256(a: __m256) -> __m256i {
    // Bitwise reinterpretation; both types are 256 bits wide.
    unsafe { transmute(a) }
}
2728
/// Casts vector of type __m256i to type __m256.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
    // Bitwise reinterpretation; both types are 256 bits wide.
    unsafe { transmute(a) }
}
2740
/// Casts vector of type __m256d to type __m256i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castpd_si256(a: __m256d) -> __m256i {
    // Bitwise reinterpretation; both types are 256 bits wide.
    unsafe { transmute(a) }
}
2752
/// Casts vector of type __m256i to type __m256d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
    // Bitwise reinterpretation; both types are 256 bits wide.
    unsafe { transmute(a) }
}
2764
/// Casts vector of type __m256 to type __m128.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castps256_ps128(a: __m256) -> __m128 {
    // Keep lanes 0..=3 (the low 128 bits); the upper half is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
}
2776
/// Casts vector of type __m256d to type __m128d.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
    // Keep lanes 0..=1 (the low 128 bits); the upper half is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1]) }
}
2788
/// Casts vector of type __m256i to type __m128i.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
    unsafe {
        // View the 256 bits as four i64 lanes and keep the low two
        // (i.e. the low 128 bits), then reinterpret as `__m128i`.
        let a = a.as_i64x4();
        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(dst)
    }
}
2804
/// Casts vector of type __m128 to type __m256;
/// the upper 128 bits of the result are undefined.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
    // Indices 0..=3 take `a`; the repeated index 4 takes lane 0 of the
    // `_mm_undefined_ps()` operand, leaving the upper half unspecified.
    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
}
2817
/// Casts vector of type __m128d to type __m256d;
/// the upper 128 bits of the result are undefined.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
    // Indices 0..=1 take `a`; the repeated index 2 takes lane 0 of the
    // `_mm_undefined_pd()` operand, leaving the upper half unspecified.
    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
}
2830
/// Casts vector of type __m128i to type __m256i;
/// the upper 128 bits of the result are undefined.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i64x2();
        // A zero vector stands in for the "undefined" upper half; the
        // repeated index 2 selects its lane 0 for both upper lanes.
        let undefined = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
        transmute(dst)
    }
}
2848
/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
/// the value of the source vector. The upper 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
    // Indices 4..=7 select the `_mm_setzero_ps()` operand, so the upper
    // 128 bits of the result are guaranteed zero.
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
}
2862
/// Constructs a 256-bit integer vector from a 128-bit integer vector.
/// The lower 128 bits contain the value of the source vector. The upper
/// 128 bits are set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Indices 2 and 3 select the zero operand `b`, so the upper
        // 128 bits of the result are guaranteed zero.
        let b = i64x2::ZERO;
        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
        transmute(dst)
    }
}
2880
/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
/// contain the value of the source vector. The upper 128 bits are set
/// to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
    // Indices 2 and 3 select the `_mm_setzero_pd()` operand, so the upper
    // 128 bits of the result are guaranteed zero.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
}
2895
/// Returns vector of type `__m256` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_undefined_ps() -> __m256 {
    // Zero is chosen as the deterministic "indeterminate" value.
    const { unsafe { mem::zeroed() } }
}
2909
/// Returns vector of type `__m256d` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_undefined_pd() -> __m256d {
    // Zero is chosen as the deterministic "indeterminate" value.
    const { unsafe { mem::zeroed() } }
}
2923
/// Returns vector of type __m256i with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_undefined_si256() -> __m256i {
    // Zero is chosen as the deterministic "indeterminate" value.
    const { unsafe { mem::zeroed() } }
}
2937
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
    // Concatenate the halves: `lo` supplies lanes 0..=3, `hi` lanes 4..=7.
    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
2948
2949/// Sets packed __m256d returned vector with the supplied values.
2950///
2951/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
2952#[inline]
2953#[target_feature(enable = "avx")]
2954#[cfg_attr(test, assert_instr(vinsertf128))]
2955#[stable(feature = "simd_x86", since = "1.27.0")]
2956pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
2957    unsafe {
2958        let hi: __m128 = transmute(hi);
2959        let lo: __m128 = transmute(lo);
2960        transmute(_mm256_set_m128(hi, lo))
2961    }
2962}
2963
2964/// Sets packed __m256i returned vector with the supplied values.
2965///
2966/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
2967#[inline]
2968#[target_feature(enable = "avx")]
2969#[cfg_attr(test, assert_instr(vinsertf128))]
2970#[stable(feature = "simd_x86", since = "1.27.0")]
2971pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
2972    unsafe {
2973        let hi: __m128 = transmute(hi);
2974        let lo: __m128 = transmute(lo);
2975        transmute(_mm256_set_m128(hi, lo))
2976    }
2977}
2978
/// Sets packed __m256 returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
    // `setr` takes the halves in low-to-high (memory) order; delegate to
    // `_mm256_set_m128`, which takes them in high-to-low order.
    _mm256_set_m128(hi, lo)
}
2989
/// Sets packed __m256d returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
    // `setr` takes the halves in low-to-high (memory) order; delegate to
    // `_mm256_set_m128d`, which takes them in high-to-low order.
    _mm256_set_m128d(hi, lo)
}
3000
/// Sets packed __m256i returned vector with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vinsertf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
    // `setr` takes the halves in low-to-high (memory) order; delegate to
    // `_mm256_set_m128i`, which takes them in high-to-low order.
    _mm256_set_m128i(hi, lo)
}
3011
/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
    // Unaligned load of the low 128 bits, widened to 256 bits (upper half
    // unspecified by the cast), then the high 128 bits are loaded and
    // inserted into lane 1.
    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
}
3026
/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory, and combine them into a 256-bit
/// value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
    // Unaligned load of the low 128 bits, widened to 256 bits (upper half
    // unspecified by the cast), then the high 128 bits are loaded and
    // inserted into lane 1.
    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
}
3041
/// Loads two 128-bit values (composed of integer data) from memory, and combine
/// them into a 256-bit value.
/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
    // Unaligned load of the low 128 bits, widened to 256 bits (upper half
    // unspecified by the cast), then the high 128 bits are loaded and
    // inserted into lane 1.
    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
}
3055
3056/// Stores the high and low 128-bit halves (each composed of 4 packed
3057/// single-precision (32-bit) floating-point elements) from `a` into memory two
3058/// different 128-bit locations.
3059/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3060///
3061/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
3062#[inline]
3063#[target_feature(enable = "avx")]
3064// This intrinsic has no corresponding instruction.
3065#[stable(feature = "simd_x86", since = "1.27.0")]
3066pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
3067    let lo = _mm256_castps256_ps128(a);
3068    _mm_storeu_ps(loaddr, lo);
3069    let hi = _mm256_extractf128_ps::<1>(a);
3070    _mm_storeu_ps(hiaddr, hi);
3071}
3072
3073/// Stores the high and low 128-bit halves (each composed of 2 packed
3074/// double-precision (64-bit) floating-point elements) from `a` into memory two
3075/// different 128-bit locations.
3076/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3077///
3078/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
3079#[inline]
3080#[target_feature(enable = "avx")]
3081// This intrinsic has no corresponding instruction.
3082#[stable(feature = "simd_x86", since = "1.27.0")]
3083pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
3084    let lo = _mm256_castpd256_pd128(a);
3085    _mm_storeu_pd(loaddr, lo);
3086    let hi = _mm256_extractf128_pd::<1>(a);
3087    _mm_storeu_pd(hiaddr, hi);
3088}
3089
3090/// Stores the high and low 128-bit halves (each composed of integer data) from
3091/// `a` into memory two different 128-bit locations.
3092/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3093///
3094/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
3095#[inline]
3096#[target_feature(enable = "avx")]
3097// This intrinsic has no corresponding instruction.
3098#[stable(feature = "simd_x86", since = "1.27.0")]
3099pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
3100    let lo = _mm256_castsi256_si128(a);
3101    _mm_storeu_si128(loaddr, lo);
3102    let hi = _mm256_extractf128_si256::<1>(a);
3103    _mm_storeu_si128(hiaddr, hi);
3104}
3105
/// Returns the first element of the input vector of `[8 x float]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movss))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtss_f32(a: __m256) -> f32 {
    // Extract lane 0 (the lowest 32 bits) of the vector.
    unsafe { simd_extract!(a, 0) }
}
3116
// LLVM intrinsics used in the above functions.
// Each `link_name` binds directly to the corresponding LLVM x86 builtin;
// the Rust signatures must match LLVM's expectations exactly.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Rounding (imm selects the rounding mode) and dot product.
    #[link_name = "llvm.x86.avx.round.pd.256"]
    fn roundpd256(a: __m256d, b: i32) -> __m256d;
    #[link_name = "llvm.x86.avx.round.ps.256"]
    fn roundps256(a: __m256, b: i32) -> __m256;
    #[link_name = "llvm.x86.avx.dp.ps.256"]
    fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
    // Comparisons; `imm8` selects the compare predicate (_CMP_* constants).
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.avx.cmp.pd.256"]
    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.avx.cmp.ps.256"]
    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
    // Float <-> integer conversions (cvtt = truncating variants).
    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
    fn vcvtps2dq(a: __m256) -> i32x8;
    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
    fn vcvttpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
    fn vcvtpd2dq(a: __m256d) -> i32x4;
    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
    fn vcvttps2dq(a: __m256) -> i32x8;
    // AVX state management.
    #[link_name = "llvm.x86.avx.vzeroall"]
    fn vzeroall();
    #[link_name = "llvm.x86.avx.vzeroupper"]
    fn vzeroupper();
    // Variable permutes (lane selection comes from a vector operand).
    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
    fn vpermilps(a: __m128, b: i32x4) -> __m128;
    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
    // Unaligned load and approximate reciprocal / reciprocal sqrt.
    #[link_name = "llvm.x86.avx.ldu.dq.256"]
    fn vlddqu(mem_addr: *const i8) -> i8x32;
    #[link_name = "llvm.x86.avx.rcp.ps.256"]
    fn vrcpps(a: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
    fn vrsqrtps(a: __m256) -> __m256;
    // VPTEST / VTESTPS / VTESTPD flag-setting tests.
    #[link_name = "llvm.x86.avx.ptestnzc.256"]
    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
    fn vtestzps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
    fn vtestcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
    fn vtestnzcps(a: __m128, b: __m128) -> i32;
    // IEEE-asymmetric min/max (second operand wins for zeros/NaNs).
    #[link_name = "llvm.x86.avx.min.ps.256"]
    fn vminps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.max.ps.256"]
    fn vmaxps(a: __m256, b: __m256) -> __m256;
    #[link_name = "llvm.x86.avx.min.pd.256"]
    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
    #[link_name = "llvm.x86.avx.max.pd.256"]
    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
}
3191
3192#[cfg(test)]
3193mod tests {
3194    use crate::hint::black_box;
3195    use crate::ptr;
3196    use stdarch_test::simd_test;
3197
3198    use crate::core_arch::x86::*;
3199
3200    #[simd_test(enable = "avx")]
3201    unsafe fn test_mm256_add_pd() {
3202        let a = _mm256_setr_pd(1., 2., 3., 4.);
3203        let b = _mm256_setr_pd(5., 6., 7., 8.);
3204        let r = _mm256_add_pd(a, b);
3205        let e = _mm256_setr_pd(6., 8., 10., 12.);
3206        assert_eq_m256d(r, e);
3207    }
3208
3209    #[simd_test(enable = "avx")]
3210    unsafe fn test_mm256_add_ps() {
3211        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3212        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3213        let r = _mm256_add_ps(a, b);
3214        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3215        assert_eq_m256(r, e);
3216    }
3217
3218    #[simd_test(enable = "avx")]
3219    unsafe fn test_mm256_and_pd() {
3220        let a = _mm256_set1_pd(1.);
3221        let b = _mm256_set1_pd(0.6);
3222        let r = _mm256_and_pd(a, b);
3223        let e = _mm256_set1_pd(0.5);
3224        assert_eq_m256d(r, e);
3225    }
3226
3227    #[simd_test(enable = "avx")]
3228    unsafe fn test_mm256_and_ps() {
3229        let a = _mm256_set1_ps(1.);
3230        let b = _mm256_set1_ps(0.6);
3231        let r = _mm256_and_ps(a, b);
3232        let e = _mm256_set1_ps(0.5);
3233        assert_eq_m256(r, e);
3234    }
3235
3236    #[simd_test(enable = "avx")]
3237    unsafe fn test_mm256_or_pd() {
3238        let a = _mm256_set1_pd(1.);
3239        let b = _mm256_set1_pd(0.6);
3240        let r = _mm256_or_pd(a, b);
3241        let e = _mm256_set1_pd(1.2);
3242        assert_eq_m256d(r, e);
3243    }
3244
3245    #[simd_test(enable = "avx")]
3246    unsafe fn test_mm256_or_ps() {
3247        let a = _mm256_set1_ps(1.);
3248        let b = _mm256_set1_ps(0.6);
3249        let r = _mm256_or_ps(a, b);
3250        let e = _mm256_set1_ps(1.2);
3251        assert_eq_m256(r, e);
3252    }
3253
3254    #[simd_test(enable = "avx")]
3255    unsafe fn test_mm256_shuffle_pd() {
3256        let a = _mm256_setr_pd(1., 4., 5., 8.);
3257        let b = _mm256_setr_pd(2., 3., 6., 7.);
3258        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3259        let e = _mm256_setr_pd(4., 3., 8., 7.);
3260        assert_eq_m256d(r, e);
3261    }
3262
3263    #[simd_test(enable = "avx")]
3264    unsafe fn test_mm256_shuffle_ps() {
3265        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3266        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3267        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3268        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3269        assert_eq_m256(r, e);
3270    }
3271
3272    #[simd_test(enable = "avx")]
3273    unsafe fn test_mm256_andnot_pd() {
3274        let a = _mm256_set1_pd(0.);
3275        let b = _mm256_set1_pd(0.6);
3276        let r = _mm256_andnot_pd(a, b);
3277        assert_eq_m256d(r, b);
3278    }
3279
3280    #[simd_test(enable = "avx")]
3281    unsafe fn test_mm256_andnot_ps() {
3282        let a = _mm256_set1_ps(0.);
3283        let b = _mm256_set1_ps(0.6);
3284        let r = _mm256_andnot_ps(a, b);
3285        assert_eq_m256(r, b);
3286    }
3287
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_max_pd() {
        // Basic element-wise maximum.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_max_pd(a, b);
        let e = _mm256_setr_pd(2., 4., 6., 8.);
        assert_eq_m256d(r, e);
        // VMAXPD is not commutative for signed zeros; per the Intel SDM:
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        // Compare raw bit patterns to distinguish -0.0 from +0.0.
        let wu: [u64; 4] = transmute(w);
        let xu: [u64; 4] = transmute(x);
        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
        assert_eq!(xu, [0u64; 4]);
        // NaN handling is also operand-order dependent:
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        let yf: [f64; 4] = transmute(y);
        let zf: [f64; 4] = transmute(z);
        assert_eq!(yf, [0.0; 4]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3313
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_max_ps() {
        // Basic element-wise maximum.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_max_ps(a, b);
        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(r, e);
        // VMAXPS is not commutative for signed zeros; per the Intel SDM:
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        // Compare raw bit patterns to distinguish -0.0 from +0.0.
        let wu: [u32; 8] = transmute(w);
        let xu: [u32; 8] = transmute(x);
        assert_eq!(wu, [0x8000_0000u32; 8]);
        assert_eq!(xu, [0u32; 8]);
        // NaN handling is also operand-order dependent:
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        let yf: [f32; 8] = transmute(y);
        let zf: [f32; 8] = transmute(z);
        assert_eq!(yf, [0.0; 8]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3339
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_min_pd() {
        // Basic element-wise minimum.
        let a = _mm256_setr_pd(1., 4., 5., 8.);
        let b = _mm256_setr_pd(2., 3., 6., 7.);
        let r = _mm256_min_pd(a, b);
        let e = _mm256_setr_pd(1., 3., 5., 7.);
        assert_eq_m256d(r, e);
        // VMINPD is not commutative for signed zeros; per the Intel SDM:
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
        // Compare raw bit patterns to distinguish -0.0 from +0.0.
        let wu: [u64; 4] = transmute(w);
        let xu: [u64; 4] = transmute(x);
        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
        assert_eq!(xu, [0u64; 4]);
        // NaN handling is also operand-order dependent:
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
        let yf: [f64; 4] = transmute(y);
        let zf: [f64; 4] = transmute(z);
        assert_eq!(yf, [0.0; 4]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3365
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_min_ps() {
        // Basic element-wise minimum.
        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let r = _mm256_min_ps(a, b);
        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(r, e);
        // VMINPS is not commutative for signed zeros; per the Intel SDM:
        // > If the values being compared are both 0.0s (of either sign), the
        // > value in the second operand (source operand) is returned.
        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
        // Compare raw bit patterns to distinguish -0.0 from +0.0.
        let wu: [u32; 8] = transmute(w);
        let xu: [u32; 8] = transmute(x);
        assert_eq!(wu, [0x8000_0000u32; 8]);
        assert_eq!(xu, [0u32; 8]);
        // NaN handling is also operand-order dependent:
        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
        // > second operand (source operand), either a NaN or a valid
        // > floating-point value, is written to the result.
        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
        let yf: [f32; 8] = transmute(y);
        let zf: [f32; 8] = transmute(z);
        assert_eq!(yf, [0.0; 8]);
        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
    }
3391
3392    #[simd_test(enable = "avx")]
3393    unsafe fn test_mm256_mul_pd() {
3394        let a = _mm256_setr_pd(1., 2., 3., 4.);
3395        let b = _mm256_setr_pd(5., 6., 7., 8.);
3396        let r = _mm256_mul_pd(a, b);
3397        let e = _mm256_setr_pd(5., 12., 21., 32.);
3398        assert_eq_m256d(r, e);
3399    }
3400
3401    #[simd_test(enable = "avx")]
3402    unsafe fn test_mm256_mul_ps() {
3403        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3404        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3405        let r = _mm256_mul_ps(a, b);
3406        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3407        assert_eq_m256(r, e);
3408    }
3409
3410    #[simd_test(enable = "avx")]
3411    unsafe fn test_mm256_addsub_pd() {
3412        let a = _mm256_setr_pd(1., 2., 3., 4.);
3413        let b = _mm256_setr_pd(5., 6., 7., 8.);
3414        let r = _mm256_addsub_pd(a, b);
3415        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3416        assert_eq_m256d(r, e);
3417    }
3418
3419    #[simd_test(enable = "avx")]
3420    unsafe fn test_mm256_addsub_ps() {
3421        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3422        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3423        let r = _mm256_addsub_ps(a, b);
3424        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3425        assert_eq_m256(r, e);
3426    }
3427
3428    #[simd_test(enable = "avx")]
3429    unsafe fn test_mm256_sub_pd() {
3430        let a = _mm256_setr_pd(1., 2., 3., 4.);
3431        let b = _mm256_setr_pd(5., 6., 7., 8.);
3432        let r = _mm256_sub_pd(a, b);
3433        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3434        assert_eq_m256d(r, e);
3435    }
3436
3437    #[simd_test(enable = "avx")]
3438    unsafe fn test_mm256_sub_ps() {
3439        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3440        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3441        let r = _mm256_sub_ps(a, b);
3442        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3443        assert_eq_m256(r, e);
3444    }
3445
3446    #[simd_test(enable = "avx")]
3447    unsafe fn test_mm256_round_pd() {
3448        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3449        let result_closest = _mm256_round_pd::<0b0000>(a);
3450        let result_down = _mm256_round_pd::<0b0001>(a);
3451        let result_up = _mm256_round_pd::<0b0010>(a);
3452        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3453        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3454        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3455        assert_eq_m256d(result_closest, expected_closest);
3456        assert_eq_m256d(result_down, expected_down);
3457        assert_eq_m256d(result_up, expected_up);
3458    }
3459
3460    #[simd_test(enable = "avx")]
3461    unsafe fn test_mm256_floor_pd() {
3462        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3463        let result_down = _mm256_floor_pd(a);
3464        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3465        assert_eq_m256d(result_down, expected_down);
3466    }
3467
3468    #[simd_test(enable = "avx")]
3469    unsafe fn test_mm256_ceil_pd() {
3470        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3471        let result_up = _mm256_ceil_pd(a);
3472        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3473        assert_eq_m256d(result_up, expected_up);
3474    }
3475
3476    #[simd_test(enable = "avx")]
3477    unsafe fn test_mm256_round_ps() {
3478        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3479        let result_closest = _mm256_round_ps::<0b0000>(a);
3480        let result_down = _mm256_round_ps::<0b0001>(a);
3481        let result_up = _mm256_round_ps::<0b0010>(a);
3482        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3483        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3484        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3485        assert_eq_m256(result_closest, expected_closest);
3486        assert_eq_m256(result_down, expected_down);
3487        assert_eq_m256(result_up, expected_up);
3488    }
3489
3490    #[simd_test(enable = "avx")]
3491    unsafe fn test_mm256_floor_ps() {
3492        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3493        let result_down = _mm256_floor_ps(a);
3494        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3495        assert_eq_m256(result_down, expected_down);
3496    }
3497
3498    #[simd_test(enable = "avx")]
3499    unsafe fn test_mm256_ceil_ps() {
3500        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3501        let result_up = _mm256_ceil_ps(a);
3502        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3503        assert_eq_m256(result_up, expected_up);
3504    }
3505
3506    #[simd_test(enable = "avx")]
3507    unsafe fn test_mm256_sqrt_pd() {
3508        let a = _mm256_setr_pd(4., 9., 16., 25.);
3509        let r = _mm256_sqrt_pd(a);
3510        let e = _mm256_setr_pd(2., 3., 4., 5.);
3511        assert_eq_m256d(r, e);
3512    }
3513
3514    #[simd_test(enable = "avx")]
3515    unsafe fn test_mm256_sqrt_ps() {
3516        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3517        let r = _mm256_sqrt_ps(a);
3518        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3519        assert_eq_m256(r, e);
3520    }
3521
3522    #[simd_test(enable = "avx")]
3523    unsafe fn test_mm256_div_ps() {
3524        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3525        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3526        let r = _mm256_div_ps(a, b);
3527        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3528        assert_eq_m256(r, e);
3529    }
3530
3531    #[simd_test(enable = "avx")]
3532    unsafe fn test_mm256_div_pd() {
3533        let a = _mm256_setr_pd(4., 9., 16., 25.);
3534        let b = _mm256_setr_pd(4., 3., 2., 5.);
3535        let r = _mm256_div_pd(a, b);
3536        let e = _mm256_setr_pd(1., 3., 8., 5.);
3537        assert_eq_m256d(r, e);
3538    }
3539
3540    #[simd_test(enable = "avx")]
3541    unsafe fn test_mm256_blend_pd() {
3542        let a = _mm256_setr_pd(4., 9., 16., 25.);
3543        let b = _mm256_setr_pd(4., 3., 2., 5.);
3544        let r = _mm256_blend_pd::<0x0>(a, b);
3545        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3546        let r = _mm256_blend_pd::<0x3>(a, b);
3547        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3548        let r = _mm256_blend_pd::<0xF>(a, b);
3549        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3550    }
3551
3552    #[simd_test(enable = "avx")]
3553    unsafe fn test_mm256_blend_ps() {
3554        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3555        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3556        let r = _mm256_blend_ps::<0x0>(a, b);
3557        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3558        let r = _mm256_blend_ps::<0x3>(a, b);
3559        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3560        let r = _mm256_blend_ps::<0xF>(a, b);
3561        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3562    }
3563
3564    #[simd_test(enable = "avx")]
3565    unsafe fn test_mm256_blendv_pd() {
3566        let a = _mm256_setr_pd(4., 9., 16., 25.);
3567        let b = _mm256_setr_pd(4., 3., 2., 5.);
3568        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3569        let r = _mm256_blendv_pd(a, b, c);
3570        let e = _mm256_setr_pd(4., 9., 2., 5.);
3571        assert_eq_m256d(r, e);
3572    }
3573
3574    #[simd_test(enable = "avx")]
3575    unsafe fn test_mm256_blendv_ps() {
3576        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3577        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3578        #[rustfmt::skip]
3579        let c = _mm256_setr_ps(
3580            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3581        );
3582        let r = _mm256_blendv_ps(a, b, c);
3583        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3584        assert_eq_m256(r, e);
3585    }
3586
3587    #[simd_test(enable = "avx")]
3588    unsafe fn test_mm256_dp_ps() {
3589        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3590        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3591        let r = _mm256_dp_ps::<0xFF>(a, b);
3592        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3593        assert_eq_m256(r, e);
3594    }
3595
3596    #[simd_test(enable = "avx")]
3597    unsafe fn test_mm256_hadd_pd() {
3598        let a = _mm256_setr_pd(4., 9., 16., 25.);
3599        let b = _mm256_setr_pd(4., 3., 2., 5.);
3600        let r = _mm256_hadd_pd(a, b);
3601        let e = _mm256_setr_pd(13., 7., 41., 7.);
3602        assert_eq_m256d(r, e);
3603
3604        let a = _mm256_setr_pd(1., 2., 3., 4.);
3605        let b = _mm256_setr_pd(5., 6., 7., 8.);
3606        let r = _mm256_hadd_pd(a, b);
3607        let e = _mm256_setr_pd(3., 11., 7., 15.);
3608        assert_eq_m256d(r, e);
3609    }
3610
3611    #[simd_test(enable = "avx")]
3612    unsafe fn test_mm256_hadd_ps() {
3613        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3614        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3615        let r = _mm256_hadd_ps(a, b);
3616        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3617        assert_eq_m256(r, e);
3618
3619        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3620        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3621        let r = _mm256_hadd_ps(a, b);
3622        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3623        assert_eq_m256(r, e);
3624    }
3625
3626    #[simd_test(enable = "avx")]
3627    unsafe fn test_mm256_hsub_pd() {
3628        let a = _mm256_setr_pd(4., 9., 16., 25.);
3629        let b = _mm256_setr_pd(4., 3., 2., 5.);
3630        let r = _mm256_hsub_pd(a, b);
3631        let e = _mm256_setr_pd(-5., 1., -9., -3.);
3632        assert_eq_m256d(r, e);
3633
3634        let a = _mm256_setr_pd(1., 2., 3., 4.);
3635        let b = _mm256_setr_pd(5., 6., 7., 8.);
3636        let r = _mm256_hsub_pd(a, b);
3637        let e = _mm256_setr_pd(-1., -1., -1., -1.);
3638        assert_eq_m256d(r, e);
3639    }
3640
3641    #[simd_test(enable = "avx")]
3642    unsafe fn test_mm256_hsub_ps() {
3643        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3644        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3645        let r = _mm256_hsub_ps(a, b);
3646        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3647        assert_eq_m256(r, e);
3648
3649        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3650        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3651        let r = _mm256_hsub_ps(a, b);
3652        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3653        assert_eq_m256(r, e);
3654    }
3655
3656    #[simd_test(enable = "avx")]
3657    unsafe fn test_mm256_xor_pd() {
3658        let a = _mm256_setr_pd(4., 9., 16., 25.);
3659        let b = _mm256_set1_pd(0.);
3660        let r = _mm256_xor_pd(a, b);
3661        assert_eq_m256d(r, a);
3662    }
3663
3664    #[simd_test(enable = "avx")]
3665    unsafe fn test_mm256_xor_ps() {
3666        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3667        let b = _mm256_set1_ps(0.);
3668        let r = _mm256_xor_ps(a, b);
3669        assert_eq_m256(r, a);
3670    }
3671
    // For the vcmp* family a "true" lane is all-ones bits; reinterpreted as a
    // float that bit pattern is a NaN, so true lanes are checked with `is_nan`.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_pd() {
        let a = _mm_setr_pd(4., 9.);
        let b = _mm_setr_pd(4., 3.);
        // 4 >= 4 and 9 >= 3: both lanes compare true.
        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
        assert!(get_m128d(r, 0).is_nan());
        assert!(get_m128d(r, 1).is_nan());
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cmp_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        // Every lane compares false, producing all-zero bits (== 0.0).
        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
        let e = _mm256_set1_pd(0.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        // Only lane 0 (4 >= 4) is true; the rest are all-zero (== 0.0).
        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
        assert!(get_m128(r, 0).is_nan());
        assert_eq!(get_m128(r, 1), 0.);
        assert_eq!(get_m128(r, 2), 0.);
        assert_eq!(get_m128(r, 3), 0.);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cmp_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
        // Every lane compares false, producing all-zero bits (== 0.0).
        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
        let e = _mm256_set1_ps(0.);
        assert_eq_m256(r, e);
    }

    // Scalar variants only compare the lowest element; upper elements are
    // copied through from `a` unchanged.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_sd() {
        let a = _mm_setr_pd(4., 9.);
        let b = _mm_setr_pd(4., 3.);
        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
        assert!(get_m128d(r, 0).is_nan());
        assert_eq!(get_m128d(r, 1), 9.);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_cmp_ss() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
        assert!(get_m128(r, 0).is_nan());
        // Lanes 1..3 pass through from `a`.
        assert_eq!(get_m128(r, 1), 3.);
        assert_eq!(get_m128(r, 2), 2.);
        assert_eq!(get_m128(r, 3), 5.);
    }
3729
    // Conversion tests use values exactly representable in every involved
    // format so rounding mode cannot affect the expected result.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtepi32_pd() {
        let a = _mm_setr_epi32(4, 9, 16, 25);
        let r = _mm256_cvtepi32_pd(a);
        let e = _mm256_setr_pd(4., 9., 16., 25.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtepi32_ps() {
        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
        let r = _mm256_cvtepi32_ps(a);
        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        assert_eq_m256(r, e);
    }

    // Narrowing f64x4 -> f32x4 halves the vector width to 128 bits.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtpd_ps() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_cvtpd_ps(a);
        let e = _mm_setr_ps(4., 9., 16., 25.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtps_epi32() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let r = _mm256_cvtps_epi32(a);
        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
        assert_eq_m256i(r, e);
    }

    // Widening f32x4 -> f64x4 doubles the vector width to 256 bits.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtps_pd() {
        let a = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm256_cvtps_pd(a);
        let e = _mm256_setr_pd(4., 9., 16., 25.);
        assert_eq_m256d(r, e);
    }

    // Extracts only the lowest double of the vector.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtsd_f64() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_cvtsd_f64(a);
        assert_eq!(r, 1.);
    }

    // `cvtt*` variants truncate; with integral inputs they must agree with
    // the rounding `cvt*` variants tested below.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvttpd_epi32() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_cvttpd_epi32(a);
        let e = _mm_setr_epi32(4, 9, 16, 25);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtpd_epi32() {
        let a = _mm256_setr_pd(4., 9., 16., 25.);
        let r = _mm256_cvtpd_epi32(a);
        let e = _mm_setr_epi32(4, 9, 16, 25);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvttps_epi32() {
        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
        let r = _mm256_cvttps_epi32(a);
        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
        assert_eq_m256i(r, e);
    }
3800
    // extractf128 with index 0 returns the low 128-bit half of the vector.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_extractf128_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_extractf128_ps::<0>(a);
        let e = _mm_setr_ps(4., 3., 2., 5.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_extractf128_pd() {
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_extractf128_pd::<0>(a);
        let e = _mm_setr_pd(4., 3.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_extractf128_si256() {
        let a = _mm256_setr_epi64x(4, 3, 2, 5);
        let r = _mm256_extractf128_si256::<0>(a);
        let e = _mm_setr_epi64x(4, 3);
        assert_eq_m128i(r, e);
    }

    // Lane extraction by constant index; -1 checks sign extension survives.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_extract_epi32() {
        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
        let r1 = _mm256_extract_epi32::<0>(a);
        let r2 = _mm256_extract_epi32::<3>(a);
        assert_eq!(r1, -1);
        assert_eq!(r2, 3);
    }

    // Returns the lowest 32-bit lane.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtsi256_si32() {
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_cvtsi256_si32(a);
        assert_eq!(r, 1);
    }
3840
    // vzeroall/vzeroupper only touch the physical YMM registers; there is no
    // observable Rust-level state to assert, so these are smoke tests that the
    // instruction executes without faulting.
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    unsafe fn test_mm256_zeroall() {
        _mm256_zeroall();
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
    unsafe fn test_mm256_zeroupper() {
        _mm256_zeroupper();
    }
3852
    // vpermilps selects within each 128-bit lane using the low 2 bits of each
    // control dword: controls 1,2,3,4 -> indices 1,2,3,0.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permutevar_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_permutevar_ps(a, b);
        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permutevar_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let b = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_permutevar_ps(a, b);
        let e = _mm_setr_ps(3., 2., 5., 4.);
        assert_eq_m128(r, e);
    }

    // Immediate 0x1b = 0b00_01_10_11 reverses the four elements of each
    // 128-bit lane.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let r = _mm256_permute_ps::<0x1b>(a);
        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permute_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let r = _mm_permute_ps::<0x1b>(a);
        let e = _mm_setr_ps(5., 2., 3., 4.);
        assert_eq_m128(r, e);
    }

    // vpermilpd uses bit 1 of each control qword to pick the element within
    // its 128-bit lane: controls 1,2,3,4 -> bits 0,1,1,0 -> a0,a1,a3,a2.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permutevar_pd() {
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        let b = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_permutevar_pd(a, b);
        let e = _mm256_setr_pd(4., 3., 5., 2.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permutevar_pd() {
        let a = _mm_setr_pd(4., 3.);
        let b = _mm_setr_epi64x(3, 0);
        let r = _mm_permutevar_pd(a, b);
        let e = _mm_setr_pd(3., 4.);
        assert_eq_m128d(r, e);
    }

    // Immediate 5 = 0b0101 swaps the pair within each 128-bit lane.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute_pd() {
        let a = _mm256_setr_pd(4., 3., 2., 5.);
        let r = _mm256_permute_pd::<5>(a);
        let e = _mm256_setr_pd(3., 4., 5., 2.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_permute_pd() {
        let a = _mm_setr_pd(4., 3.);
        let r = _mm_permute_pd::<1>(a);
        let e = _mm_setr_pd(3., 4.);
        assert_eq_m128d(r, e);
    }
3920
    // vperm2f128 selects a 128-bit half per imm8 nibble: 0/1 = low/high of a,
    // 2/3 = low/high of b. Bit 3 of a nibble zeroes that output half.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute2f128_ps() {
        let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.);
        let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.);
        // Low nibble 3 -> high half of b; high nibble 1 -> high half of a.
        let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b);
        let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.);
        assert_eq_m256(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b);
        let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
        assert_eq_m256(r, z);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute2f128_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        // Low nibble 1 -> high half of a; high nibble 3 -> high half of b.
        let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b);
        let e = _mm256_setr_pd(3., 4., 7., 8.);
        assert_eq_m256d(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b);
        let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_permute2f128_si256() {
        let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18);
        let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28);
        // Low nibble 0 -> low half of a; high nibble 2 -> low half of b.
        let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b);
        let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24);
        assert_eq_m256i(r, e);

        // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field.
        let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b);
        let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(r, e);
    }
3962
    // Scalar broadcasts replicate one element to every lane, so the result
    // must equal the corresponding `set1` vector.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_broadcast_ss() {
        let r = _mm256_broadcast_ss(&3.);
        let e = _mm256_set1_ps(3.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_broadcast_ss() {
        let r = _mm_broadcast_ss(&3.);
        let e = _mm_set1_ps(3.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_broadcast_sd() {
        let r = _mm256_broadcast_sd(&3.);
        let e = _mm256_set1_pd(3.);
        assert_eq_m256d(r, e);
    }

    // 128-bit broadcasts duplicate the whole source vector into both halves.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_broadcast_ps() {
        let a = _mm_setr_ps(4., 3., 2., 5.);
        let r = _mm256_broadcast_ps(&a);
        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_broadcast_pd() {
        let a = _mm_setr_pd(4., 3.);
        let r = _mm256_broadcast_pd(&a);
        let e = _mm256_setr_pd(4., 3., 4., 3.);
        assert_eq_m256d(r, e);
    }
3999
    // insertf128 with index 0 replaces the low 128-bit half and keeps the
    // high half of `a` unchanged.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insertf128_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let b = _mm_setr_ps(4., 9., 16., 25.);
        let r = _mm256_insertf128_ps::<0>(a, b);
        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insertf128_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm256_insertf128_pd::<0>(a, b);
        let e = _mm256_setr_pd(5., 6., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insertf128_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm_setr_epi64x(5, 6);
        let r = _mm256_insertf128_si256::<0>(a, b);
        let e = _mm256_setr_epi64x(5, 6, 3, 4);
        assert_eq_m256i(r, e);
    }

    // insert_epi* replaces exactly one lane (here the last) and leaves the
    // other 31/15/7 lanes untouched.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insert_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_insert_epi8::<31>(a, 0);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 0,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insert_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_insert_epi16::<15>(a, 0);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 0,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_insert_epi32() {
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_insert_epi32::<7>(a, 0);
        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
        assert_eq_m256i(r, e);
    }
4070
    // Aligned load/store tests round-trip through a pointer to a vector
    // value; `__m256*` locals are 32-byte aligned, satisfying the aligned
    // variants' alignment requirement.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_load_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let p = ptr::addr_of!(a) as *const f64;
        let r = _mm256_load_pd(p);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_store_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut r = _mm256_undefined_pd();
        _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
        assert_eq_m256d(r, a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_load_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let p = ptr::addr_of!(a) as *const f32;
        let r = _mm256_load_ps(p);
        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_store_ps() {
        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        let mut r = _mm256_undefined_ps();
        _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
        assert_eq_m256(r, a);
    }

    // Unaligned loads read from a plain slice (no alignment guarantee);
    // the pointer goes through `black_box` so the load isn't const-folded.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu_pd() {
        let a = &[1.0f64, 2., 3., 4.];
        let p = a.as_ptr();
        let r = _mm256_loadu_pd(black_box(p));
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu_pd() {
        let a = _mm256_set1_pd(9.);
        let mut r = _mm256_undefined_pd();
        _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
        assert_eq_m256d(r, a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu_ps() {
        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
        let p = a.as_ptr();
        let r = _mm256_loadu_ps(black_box(p));
        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu_ps() {
        let a = _mm256_set1_ps(9.);
        let mut r = _mm256_undefined_ps();
        _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
        assert_eq_m256(r, a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_load_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let p = ptr::addr_of!(a);
        let r = _mm256_load_si256(p);
        let e = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_store_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = _mm256_undefined_si256();
        _mm256_store_si256(ptr::addr_of_mut!(r), a);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let p = ptr::addr_of!(a);
        let r = _mm256_loadu_si256(black_box(p));
        let e = _mm256_setr_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu_si256() {
        let a = _mm256_set1_epi8(9);
        let mut r = _mm256_undefined_si256();
        _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
        assert_eq_m256i(r, a);
    }
4172
    // Masked loads/stores act per element based on the mask element's most
    // significant bit (`!0` = all-ones = MSB set = enabled). Disabled load
    // lanes are zeroed; disabled store lanes leave memory untouched.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_maskload_pd() {
        let a = &[1.0f64, 2., 3., 4.];
        let p = a.as_ptr();
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let r = _mm256_maskload_pd(black_box(p), mask);
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_maskstore_pd() {
        // `r` starts at all zeros so untouched lanes are observable as 0.
        let mut r = _mm256_set1_pd(0.);
        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        let e = _mm256_setr_pd(0., 2., 0., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_maskload_pd() {
        let a = &[1.0f64, 2.];
        let p = a.as_ptr();
        let mask = _mm_setr_epi64x(0, !0);
        let r = _mm_maskload_pd(black_box(p), mask);
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_maskstore_pd() {
        let mut r = _mm_set1_pd(0.);
        let mask = _mm_setr_epi64x(0, !0);
        let a = _mm_setr_pd(1., 2.);
        _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
        let e = _mm_setr_pd(0., 2.);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_maskload_ps() {
        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
        let p = a.as_ptr();
        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
        let r = _mm256_maskload_ps(black_box(p), mask);
        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_maskstore_ps() {
        let mut r = _mm256_set1_ps(0.);
        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_maskload_ps() {
        let a = &[1.0f32, 2., 3., 4.];
        let p = a.as_ptr();
        let mask = _mm_setr_epi32(0, !0, 0, !0);
        let r = _mm_maskload_ps(black_box(p), mask);
        let e = _mm_setr_ps(0., 2., 0., 4.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_maskstore_ps() {
        let mut r = _mm_set1_ps(0.);
        let mask = _mm_setr_epi32(0, !0, 0, !0);
        let a = _mm_setr_ps(1., 2., 3., 4.);
        _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
        let e = _mm_setr_ps(0., 2., 0., 4.);
        assert_eq_m128(r, e);
    }
4252
    // vmovshdup duplicates the odd-indexed elements into the even slots.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_movehdup_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_movehdup_ps(a);
        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
        assert_eq_m256(r, e);
    }

    // vmovsldup duplicates the even-indexed elements into the odd slots.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_moveldup_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_moveldup_ps(a);
        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
        assert_eq_m256(r, e);
    }

    // vmovddup duplicates the even-indexed doubles into the odd slots.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_movedup_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_movedup_pd(a);
        let e = _mm256_setr_pd(1., 1., 3., 3.);
        assert_eq_m256d(r, e);
    }
4276
    // vlddqu is an unaligned integer load; the result must be byte-identical
    // to the source. `black_box` keeps the load from being const-folded.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_lddqu_si256() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let p = ptr::addr_of!(a);
        let r = _mm256_lddqu_si256(black_box(p));
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
4297
    // Non-temporal stores bypass the cache; `_mm_sfence` orders them before
    // the subsequent reads that validate the written memory.
    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut r = _mm256_undefined_si256();
        _mm256_stream_si256(ptr::addr_of_mut!(r), a);
        _mm_sfence();
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_pd() {
        // vmovntpd requires a 32-byte-aligned destination.
        #[repr(align(32))]
        struct Memory {
            pub data: [f64; 4],
        }
        let a = _mm256_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 4] };

        _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..4 {
            assert_eq!(mem.data[i], get_m256d(a, i));
        }
    }

    #[simd_test(enable = "avx")]
    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
    unsafe fn test_mm256_stream_ps() {
        // vmovntps requires a 32-byte-aligned destination.
        #[repr(align(32))]
        struct Memory {
            pub data: [f32; 8],
        }
        let a = _mm256_set1_ps(7.0);
        let mut mem = Memory { data: [-1.0; 8] };

        _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..8 {
            assert_eq!(mem.data[i], get_m256(a, i));
        }
    }
4341
    // vrcpps/vrsqrtps are hardware approximations with a maximum relative
    // error of 1.5 * 2^-12, so results are compared within a tolerance
    // rather than exactly.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_rcp_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rcp_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.49987793, 0.33325195, 0.24993896,
            0.19995117, 0.16662598, 0.14282227, 0.12496948,
        );
        // 2^-11; doubled below to cover implementation variation.
        let rel_err = 0.00048828125;
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_rsqrt_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_rsqrt_ps(a);
        #[rustfmt::skip]
        let e = _mm256_setr_ps(
            0.99975586, 0.7069092, 0.5772705, 0.49987793,
            0.44714355, 0.40820313, 0.3779297, 0.3534546,
        );
        // 2^-11; doubled below to cover implementation variation.
        let rel_err = 0.00048828125;
        for i in 0..8 {
            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
        }
    }
4371
    // The unpack intrinsics interleave elements from the high (unpackhi) or
    // low (unpacklo) halves of each 128-bit lane independently — they do not
    // cross the 128-bit lane boundary.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_unpackhi_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpackhi_pd(a, b);
        let e = _mm256_setr_pd(2., 6., 4., 8.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_unpackhi_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpackhi_ps(a, b);
        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_unpacklo_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_unpacklo_pd(a, b);
        let e = _mm256_setr_pd(1., 5., 3., 7.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_unpacklo_ps() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
        let r = _mm256_unpacklo_ps(a, b);
        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
        assert_eq_m256(r, e);
    }
4407
    // The test family maps to vptest/vtestpd/vtestps flag semantics:
    //   testz   -> ZF: 1 iff (a AND b) is all zero
    //   testc   -> CF: 1 iff (NOT a AND b) is all zero
    //   testnzc -> 1 iff both ZF and CF are 0
    // The pd/ps variants consider only the sign bit of each element, which is
    // why negative float constants (sign bit set) are used below.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testz_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testz_si256(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 0);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_testc_si256(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testnzc_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm256_setr_epi64x(5, 6, 7, 8);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 1);
        let a = _mm256_setr_epi64x(0, 0, 0, 0);
        let b = _mm256_setr_epi64x(0, 0, 0, 0);
        let r = _mm256_testnzc_si256(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testz_pd() {
        // All positive: no sign bits set, so the AND of sign bits is zero.
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm256_set1_pd(-1.);
        let r = _mm256_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm256_set1_pd(1.);
        let b = _mm256_set1_pd(-1.);
        let r = _mm256_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testnzc_pd() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let b = _mm256_setr_pd(5., 6., 7., 8.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 0);
        // Mixed signs make both the AND and ANDN of sign bits nonzero.
        let a = _mm256_setr_pd(1., -1., -1., -1.);
        let b = _mm256_setr_pd(-1., -1., 1., 1.);
        let r = _mm256_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testz_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testz_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(-1.);
        let r = _mm_testz_pd(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 1);
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(-1.);
        let r = _mm_testc_pd(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testnzc_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(5., 6.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 0);
        let a = _mm_setr_pd(1., -1.);
        let b = _mm_setr_pd(-1., -1.);
        let r = _mm_testnzc_pd(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testz_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm256_set1_ps(-1.);
        let r = _mm256_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm256_set1_ps(-1.);
        let r = _mm256_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_testnzc_ps() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
        let r = _mm256_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testz_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 1);
        let a = _mm_set1_ps(-1.);
        let r = _mm_testz_ps(a, a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testc_ps(a, a);
        assert_eq!(r, 1);
        let b = _mm_set1_ps(-1.);
        let r = _mm_testc_ps(a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm_testnzc_ps() {
        let a = _mm_set1_ps(1.);
        let r = _mm_testnzc_ps(a, a);
        assert_eq!(r, 0);
        let a = _mm_setr_ps(1., -1., -1., -1.);
        let b = _mm_setr_ps(-1., -1., 1., 1.);
        let r = _mm_testnzc_ps(a, b);
        assert_eq!(r, 1);
    }
4573
4574    #[simd_test(enable = "avx")]
4575    unsafe fn test_mm256_movemask_pd() {
4576        let a = _mm256_setr_pd(1., -2., 3., -4.);
4577        let r = _mm256_movemask_pd(a);
4578        assert_eq!(r, 0xA);
4579    }
4580
4581    #[simd_test(enable = "avx")]
4582    unsafe fn test_mm256_movemask_ps() {
4583        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
4584        let r = _mm256_movemask_ps(a);
4585        assert_eq!(r, 0xAA);
4586    }
4587
4588    #[simd_test(enable = "avx")]
4589    unsafe fn test_mm256_setzero_pd() {
4590        let r = _mm256_setzero_pd();
4591        assert_eq_m256d(r, _mm256_set1_pd(0.));
4592    }
4593
4594    #[simd_test(enable = "avx")]
4595    unsafe fn test_mm256_setzero_ps() {
4596        let r = _mm256_setzero_ps();
4597        assert_eq_m256(r, _mm256_set1_ps(0.));
4598    }
4599
4600    #[simd_test(enable = "avx")]
4601    unsafe fn test_mm256_setzero_si256() {
4602        let r = _mm256_setzero_si256();
4603        assert_eq_m256i(r, _mm256_set1_epi8(0));
4604    }
4605
4606    #[simd_test(enable = "avx")]
4607    unsafe fn test_mm256_set_pd() {
4608        let r = _mm256_set_pd(1., 2., 3., 4.);
4609        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4610    }
4611
4612    #[simd_test(enable = "avx")]
4613    unsafe fn test_mm256_set_ps() {
4614        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4615        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
4616    }
4617
    /// `_mm256_set_epi8` takes its 32 byte arguments highest lane first, so
    /// the result must equal `_mm256_setr_epi8` (lowest lane first) called
    /// with the argument order reversed.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi8() {
        #[rustfmt::skip]
        let r = _mm256_set_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 31, 30, 29, 28, 27, 26, 25,
            24, 23, 22, 21, 20, 19, 18, 17,
            16, 15, 14, 13, 12, 11, 10, 9,
            8, 7, 6, 5, 4, 3, 2, 1
        );
        assert_eq_m256i(r, e);
    }
4636
    /// `_mm256_set_epi16` takes its 16 word arguments highest lane first, so
    /// the result must equal `_mm256_setr_epi16` (lowest lane first) called
    /// with the argument order reversed.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi16() {
        #[rustfmt::skip]
        let r = _mm256_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            16, 15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1,
        );
        assert_eq_m256i(r, e);
    }
4651
4652    #[simd_test(enable = "avx")]
4653    unsafe fn test_mm256_set_epi32() {
4654        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4655        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4656    }
4657
4658    #[simd_test(enable = "avx")]
4659    unsafe fn test_mm256_set_epi64x() {
4660        let r = _mm256_set_epi64x(1, 2, 3, 4);
4661        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4662    }
4663
4664    #[simd_test(enable = "avx")]
4665    unsafe fn test_mm256_setr_pd() {
4666        let r = _mm256_setr_pd(1., 2., 3., 4.);
4667        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4668    }
4669
4670    #[simd_test(enable = "avx")]
4671    unsafe fn test_mm256_setr_ps() {
4672        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4673        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
4674    }
4675
4676    #[simd_test(enable = "avx")]
4677    unsafe fn test_mm256_setr_epi8() {
4678        #[rustfmt::skip]
4679        let r = _mm256_setr_epi8(
4680            1, 2, 3, 4, 5, 6, 7, 8,
4681            9, 10, 11, 12, 13, 14, 15, 16,
4682            17, 18, 19, 20, 21, 22, 23, 24,
4683            25, 26, 27, 28, 29, 30, 31, 32,
4684        );
4685        #[rustfmt::skip]
4686        let e = _mm256_setr_epi8(
4687            1, 2, 3, 4, 5, 6, 7, 8,
4688            9, 10, 11, 12, 13, 14, 15, 16,
4689            17, 18, 19, 20, 21, 22, 23, 24,
4690            25, 26, 27, 28, 29, 30, 31, 32
4691        );
4692
4693        assert_eq_m256i(r, e);
4694    }
4695
4696    #[simd_test(enable = "avx")]
4697    unsafe fn test_mm256_setr_epi16() {
4698        #[rustfmt::skip]
4699        let r = _mm256_setr_epi16(
4700            1, 2, 3, 4, 5, 6, 7, 8,
4701            9, 10, 11, 12, 13, 14, 15, 16,
4702        );
4703        #[rustfmt::skip]
4704        let e = _mm256_setr_epi16(
4705            1, 2, 3, 4, 5, 6, 7, 8,
4706            9, 10, 11, 12, 13, 14, 15, 16,
4707        );
4708        assert_eq_m256i(r, e);
4709    }
4710
4711    #[simd_test(enable = "avx")]
4712    unsafe fn test_mm256_setr_epi32() {
4713        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4714        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4715    }
4716
4717    #[simd_test(enable = "avx")]
4718    unsafe fn test_mm256_setr_epi64x() {
4719        let r = _mm256_setr_epi64x(1, 2, 3, 4);
4720        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4721    }
4722
4723    #[simd_test(enable = "avx")]
4724    unsafe fn test_mm256_set1_pd() {
4725        let r = _mm256_set1_pd(1.);
4726        assert_eq_m256d(r, _mm256_set1_pd(1.));
4727    }
4728
4729    #[simd_test(enable = "avx")]
4730    unsafe fn test_mm256_set1_ps() {
4731        let r = _mm256_set1_ps(1.);
4732        assert_eq_m256(r, _mm256_set1_ps(1.));
4733    }
4734
4735    #[simd_test(enable = "avx")]
4736    unsafe fn test_mm256_set1_epi8() {
4737        let r = _mm256_set1_epi8(1);
4738        assert_eq_m256i(r, _mm256_set1_epi8(1));
4739    }
4740
4741    #[simd_test(enable = "avx")]
4742    unsafe fn test_mm256_set1_epi16() {
4743        let r = _mm256_set1_epi16(1);
4744        assert_eq_m256i(r, _mm256_set1_epi16(1));
4745    }
4746
4747    #[simd_test(enable = "avx")]
4748    unsafe fn test_mm256_set1_epi32() {
4749        let r = _mm256_set1_epi32(1);
4750        assert_eq_m256i(r, _mm256_set1_epi32(1));
4751    }
4752
4753    #[simd_test(enable = "avx")]
4754    unsafe fn test_mm256_set1_epi64x() {
4755        let r = _mm256_set1_epi64x(1);
4756        assert_eq_m256i(r, _mm256_set1_epi64x(1));
4757    }
4758
    /// The cast is a pure bit reinterpretation: each f64 splits into two f32
    /// lanes. E.g. 1.0f64 is 0x3FF0_0000_0000_0000, whose halves reinterpret
    /// as 0.0f32 (low) and 1.875f32 (0x3FF00000, high).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castpd_ps() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_ps(a);
        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        assert_eq_m256(r, e);
    }
4766
    /// Inverse of `test_mm256_castpd_ps`: pairs of f32 lanes reassemble into
    /// the f64 bit patterns of 1.0, 2.0, 3.0 and 4.0 (bit reinterpretation,
    /// no numeric conversion).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castps_pd() {
        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        let r = _mm256_castps_pd(a);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }
4774
    /// Bit-reinterprets floats as integers: the expected bytes are the
    /// little-endian encodings of each f32 (1.0f32 = 0x3F800000 ->
    /// bytes 0, 0, -128, 63; and so on).
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castps_si256() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps_si256(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        assert_eq_m256i(r, e);
    }
4788
    /// Inverse of `test_mm256_castps_si256`: the little-endian f32 byte
    /// encodings reinterpret back to the floats 1.0 through 8.0.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castsi256_ps() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        let r = _mm256_castsi256_ps(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }
4802
4803    #[simd_test(enable = "avx")]
4804    unsafe fn test_mm256_castpd_si256() {
4805        let a = _mm256_setr_pd(1., 2., 3., 4.);
4806        let r = _mm256_castpd_si256(a);
4807        assert_eq_m256d(transmute(r), a);
4808    }
4809
4810    #[simd_test(enable = "avx")]
4811    unsafe fn test_mm256_castsi256_pd() {
4812        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4813        let r = _mm256_castsi256_pd(a);
4814        assert_eq_m256d(r, transmute(a));
4815    }
4816
4817    #[simd_test(enable = "avx")]
4818    unsafe fn test_mm256_castps256_ps128() {
4819        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4820        let r = _mm256_castps256_ps128(a);
4821        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
4822    }
4823
4824    #[simd_test(enable = "avx")]
4825    unsafe fn test_mm256_castpd256_pd128() {
4826        let a = _mm256_setr_pd(1., 2., 3., 4.);
4827        let r = _mm256_castpd256_pd128(a);
4828        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
4829    }
4830
4831    #[simd_test(enable = "avx")]
4832    unsafe fn test_mm256_castsi256_si128() {
4833        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4834        let r = _mm256_castsi256_si128(a);
4835        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
4836    }
4837
4838    #[simd_test(enable = "avx")]
4839    unsafe fn test_mm256_castps128_ps256() {
4840        let a = _mm_setr_ps(1., 2., 3., 4.);
4841        let r = _mm256_castps128_ps256(a);
4842        assert_eq_m128(_mm256_castps256_ps128(r), a);
4843    }
4844
4845    #[simd_test(enable = "avx")]
4846    unsafe fn test_mm256_castpd128_pd256() {
4847        let a = _mm_setr_pd(1., 2.);
4848        let r = _mm256_castpd128_pd256(a);
4849        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
4850    }
4851
4852    #[simd_test(enable = "avx")]
4853    unsafe fn test_mm256_castsi128_si256() {
4854        let a = _mm_setr_epi32(1, 2, 3, 4);
4855        let r = _mm256_castsi128_si256(a);
4856        assert_eq_m128i(_mm256_castsi256_si128(r), a);
4857    }
4858
4859    #[simd_test(enable = "avx")]
4860    unsafe fn test_mm256_zextps128_ps256() {
4861        let a = _mm_setr_ps(1., 2., 3., 4.);
4862        let r = _mm256_zextps128_ps256(a);
4863        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
4864        assert_eq_m256(r, e);
4865    }
4866
4867    #[simd_test(enable = "avx")]
4868    unsafe fn test_mm256_zextsi128_si256() {
4869        let a = _mm_setr_epi64x(1, 2);
4870        let r = _mm256_zextsi128_si256(a);
4871        let e = _mm256_setr_epi64x(1, 2, 0, 0);
4872        assert_eq_m256i(r, e);
4873    }
4874
4875    #[simd_test(enable = "avx")]
4876    unsafe fn test_mm256_zextpd128_pd256() {
4877        let a = _mm_setr_pd(1., 2.);
4878        let r = _mm256_zextpd128_pd256(a);
4879        let e = _mm256_setr_pd(1., 2., 0., 0.);
4880        assert_eq_m256d(r, e);
4881    }
4882
4883    #[simd_test(enable = "avx")]
4884    unsafe fn test_mm256_set_m128() {
4885        let hi = _mm_setr_ps(5., 6., 7., 8.);
4886        let lo = _mm_setr_ps(1., 2., 3., 4.);
4887        let r = _mm256_set_m128(hi, lo);
4888        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4889        assert_eq_m256(r, e);
4890    }
4891
4892    #[simd_test(enable = "avx")]
4893    unsafe fn test_mm256_set_m128d() {
4894        let hi = _mm_setr_pd(3., 4.);
4895        let lo = _mm_setr_pd(1., 2.);
4896        let r = _mm256_set_m128d(hi, lo);
4897        let e = _mm256_setr_pd(1., 2., 3., 4.);
4898        assert_eq_m256d(r, e);
4899    }
4900
    /// `set_m128i(hi, lo)` concatenates two 128-bit halves: `lo` becomes the
    /// low (first) 16 bytes of the result and `hi` the upper 16 bytes.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20,
            21, 22, 23, 24,
            25, 26, 27, 28,
            29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let r = _mm256_set_m128i(hi, lo);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
4927
4928    #[simd_test(enable = "avx")]
4929    unsafe fn test_mm256_setr_m128() {
4930        let lo = _mm_setr_ps(1., 2., 3., 4.);
4931        let hi = _mm_setr_ps(5., 6., 7., 8.);
4932        let r = _mm256_setr_m128(lo, hi);
4933        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4934        assert_eq_m256(r, e);
4935    }
4936
4937    #[simd_test(enable = "avx")]
4938    unsafe fn test_mm256_setr_m128d() {
4939        let lo = _mm_setr_pd(1., 2.);
4940        let hi = _mm_setr_pd(3., 4.);
4941        let r = _mm256_setr_m128d(lo, hi);
4942        let e = _mm256_setr_pd(1., 2., 3., 4.);
4943        assert_eq_m256d(r, e);
4944    }
4945
    /// `setr_m128i(lo, hi)` takes the low half first — the reverse argument
    /// order of `set_m128i` — producing `lo` in bytes 0..16 and `hi` in
    /// bytes 16..32.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_m128i() {
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_setr_m128i(lo, hi);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
4970
4971    #[simd_test(enable = "avx")]
4972    unsafe fn test_mm256_loadu2_m128() {
4973        let hi = &[5., 6., 7., 8.];
4974        let hiaddr = hi.as_ptr();
4975        let lo = &[1., 2., 3., 4.];
4976        let loaddr = lo.as_ptr();
4977        let r = _mm256_loadu2_m128(hiaddr, loaddr);
4978        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4979        assert_eq_m256(r, e);
4980    }
4981
4982    #[simd_test(enable = "avx")]
4983    unsafe fn test_mm256_loadu2_m128d() {
4984        let hi = &[3., 4.];
4985        let hiaddr = hi.as_ptr();
4986        let lo = &[1., 2.];
4987        let loaddr = lo.as_ptr();
4988        let r = _mm256_loadu2_m128d(hiaddr, loaddr);
4989        let e = _mm256_setr_pd(1., 2., 3., 4.);
4990        assert_eq_m256d(r, e);
4991    }
4992
    /// Loads two unaligned 128-bit integer halves: the `lo` pointer supplies
    /// bytes 0..16 of the result and the `hi` pointer bytes 16..32.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu2_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        // `addr_of!` avoids creating intermediate references to the vectors.
        let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
5015
5016    #[simd_test(enable = "avx")]
5017    unsafe fn test_mm256_storeu2_m128() {
5018        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5019        let mut hi = _mm_undefined_ps();
5020        let mut lo = _mm_undefined_ps();
5021        _mm256_storeu2_m128(
5022            ptr::addr_of_mut!(hi) as *mut f32,
5023            ptr::addr_of_mut!(lo) as *mut f32,
5024            a,
5025        );
5026        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
5027        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
5028    }
5029
5030    #[simd_test(enable = "avx")]
5031    unsafe fn test_mm256_storeu2_m128d() {
5032        let a = _mm256_setr_pd(1., 2., 3., 4.);
5033        let mut hi = _mm_undefined_pd();
5034        let mut lo = _mm_undefined_pd();
5035        _mm256_storeu2_m128d(
5036            ptr::addr_of_mut!(hi) as *mut f64,
5037            ptr::addr_of_mut!(lo) as *mut f64,
5038            a,
5039        );
5040        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
5041        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
5042    }
5043
    /// Stores the two 128-bit integer halves to separate unaligned
    /// destinations: bytes 0..16 to the `lo` pointer, bytes 16..32 to `hi`.
    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu2_m128i() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        // Destinations start undefined; the store must overwrite both fully.
        let mut hi = _mm_undefined_si128();
        let mut lo = _mm_undefined_si128();
        _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
        #[rustfmt::skip]
        let e_hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );
        #[rustfmt::skip]
        let e_lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16
        );

        assert_eq_m128i(hi, e_hi);
        assert_eq_m128i(lo, e_lo);
    }
5070
5071    #[simd_test(enable = "avx")]
5072    unsafe fn test_mm256_cvtss_f32() {
5073        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5074        let r = _mm256_cvtss_f32(a);
5075        assert_eq!(r, 1.);
5076    }
5077}