core/stdarch/crates/core_arch/src/x86/
avx2.rs

1//! Advanced Vector Extensions 2 (AVX)
2//!
3//! AVX2 expands most AVX commands to 256-bit wide vector registers and
4//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5//!
6//! The references are:
7//!
8//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9//!   Instruction Set Reference, A-Z][intel64_ref].
10//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11//!   System Instructions][amd64_ref].
12//!
13//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14//! overview of the instructions available.
15//!
16//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21use core::hint::unreachable_unchecked;
22
23use crate::core_arch::{simd::*, x86::*};
24use crate::intrinsics::simd::*;
25
26#[cfg(test)]
27use stdarch_test::assert_instr;
28
29/// Computes the absolute values of packed 32-bit integers in `a`.
30///
31/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
32#[inline]
33#[target_feature(enable = "avx2")]
34#[cfg_attr(test, assert_instr(vpabsd))]
35#[stable(feature = "simd_x86", since = "1.27.0")]
36pub fn _mm256_abs_epi32(a: __m256i) -> __m256i {
37    unsafe {
38        let a = a.as_i32x8();
39        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
40        transmute(r)
41    }
42}
43
44/// Computes the absolute values of packed 16-bit integers in `a`.
45///
46/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
47#[inline]
48#[target_feature(enable = "avx2")]
49#[cfg_attr(test, assert_instr(vpabsw))]
50#[stable(feature = "simd_x86", since = "1.27.0")]
51pub fn _mm256_abs_epi16(a: __m256i) -> __m256i {
52    unsafe {
53        let a = a.as_i16x16();
54        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
55        transmute(r)
56    }
57}
58
59/// Computes the absolute values of packed 8-bit integers in `a`.
60///
61/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
62#[inline]
63#[target_feature(enable = "avx2")]
64#[cfg_attr(test, assert_instr(vpabsb))]
65#[stable(feature = "simd_x86", since = "1.27.0")]
66pub fn _mm256_abs_epi8(a: __m256i) -> __m256i {
67    unsafe {
68        let a = a.as_i8x32();
69        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
70        transmute(r)
71    }
72}
73
74/// Adds packed 64-bit integers in `a` and `b`.
75///
76/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
77#[inline]
78#[target_feature(enable = "avx2")]
79#[cfg_attr(test, assert_instr(vpaddq))]
80#[stable(feature = "simd_x86", since = "1.27.0")]
81pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
82    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
83}
84
85/// Adds packed 32-bit integers in `a` and `b`.
86///
87/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
88#[inline]
89#[target_feature(enable = "avx2")]
90#[cfg_attr(test, assert_instr(vpaddd))]
91#[stable(feature = "simd_x86", since = "1.27.0")]
92pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
93    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
94}
95
96/// Adds packed 16-bit integers in `a` and `b`.
97///
98/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
99#[inline]
100#[target_feature(enable = "avx2")]
101#[cfg_attr(test, assert_instr(vpaddw))]
102#[stable(feature = "simd_x86", since = "1.27.0")]
103pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
104    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
105}
106
107/// Adds packed 8-bit integers in `a` and `b`.
108///
109/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
110#[inline]
111#[target_feature(enable = "avx2")]
112#[cfg_attr(test, assert_instr(vpaddb))]
113#[stable(feature = "simd_x86", since = "1.27.0")]
114pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
115    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
116}
117
118/// Adds packed 8-bit integers in `a` and `b` using saturation.
119///
120/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
121#[inline]
122#[target_feature(enable = "avx2")]
123#[cfg_attr(test, assert_instr(vpaddsb))]
124#[stable(feature = "simd_x86", since = "1.27.0")]
125pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
126    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
127}
128
129/// Adds packed 16-bit integers in `a` and `b` using saturation.
130///
131/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
132#[inline]
133#[target_feature(enable = "avx2")]
134#[cfg_attr(test, assert_instr(vpaddsw))]
135#[stable(feature = "simd_x86", since = "1.27.0")]
136pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
137    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
138}
139
140/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
141///
142/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
143#[inline]
144#[target_feature(enable = "avx2")]
145#[cfg_attr(test, assert_instr(vpaddusb))]
146#[stable(feature = "simd_x86", since = "1.27.0")]
147pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
148    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
149}
150
151/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
152///
153/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
154#[inline]
155#[target_feature(enable = "avx2")]
156#[cfg_attr(test, assert_instr(vpaddusw))]
157#[stable(feature = "simd_x86", since = "1.27.0")]
158pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
159    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
160}
161
162/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
163/// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
164///
165/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
166#[inline]
167#[target_feature(enable = "avx2")]
168#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
169#[rustc_legacy_const_generics(2)]
170#[stable(feature = "simd_x86", since = "1.27.0")]
171pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
172    static_assert_uimm_bits!(IMM8, 8);
173    unsafe {
174        // If palignr is shifting the pair of vectors more than the size of two
175        // lanes, emit zero.
176        if IMM8 >= 32 {
177            return _mm256_setzero_si256();
178        }
179        // If palignr is shifting the pair of input vectors more than one lane,
180        // but less than two lanes, convert to shifting in zeroes.
181        let (a, b) = if IMM8 > 16 {
182            (_mm256_setzero_si256(), a)
183        } else {
184            (a, b)
185        };
186
187        let a = a.as_i8x32();
188        let b = b.as_i8x32();
189
190        if IMM8 == 16 {
191            return transmute(a);
192        }
193
194        let r: i8x32 = match IMM8 % 16 {
195            0 => simd_shuffle!(
196                b,
197                a,
198                [
199                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
200                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
201                ],
202            ),
203            1 => simd_shuffle!(
204                b,
205                a,
206                [
207                    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22,
208                    23, 24, 25, 26, 27, 28, 29, 30, 31, 48,
209                ],
210            ),
211            2 => simd_shuffle!(
212                b,
213                a,
214                [
215                    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23,
216                    24, 25, 26, 27, 28, 29, 30, 31, 48, 49,
217                ],
218            ),
219            3 => simd_shuffle!(
220                b,
221                a,
222                [
223                    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23,
224                    24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
225                ],
226            ),
227            4 => simd_shuffle!(
228                b,
229                a,
230                [
231                    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24,
232                    25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
233                ],
234            ),
235            5 => simd_shuffle!(
236                b,
237                a,
238                [
239                    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25,
240                    26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
241                ],
242            ),
243            6 => simd_shuffle!(
244                b,
245                a,
246                [
247                    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26,
248                    27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
249                ],
250            ),
251            7 => simd_shuffle!(
252                b,
253                a,
254                [
255                    7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26,
256                    27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
257                ],
258            ),
259            8 => simd_shuffle!(
260                b,
261                a,
262                [
263                    8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27,
264                    28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
265                ],
266            ),
267            9 => simd_shuffle!(
268                b,
269                a,
270                [
271                    9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28,
272                    29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
273                ],
274            ),
275            10 => simd_shuffle!(
276                b,
277                a,
278                [
279                    10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29,
280                    30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
281                ],
282            ),
283            11 => simd_shuffle!(
284                b,
285                a,
286                [
287                    11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30,
288                    31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
289                ],
290            ),
291            12 => simd_shuffle!(
292                b,
293                a,
294                [
295                    12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31,
296                    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
297                ],
298            ),
299            13 => simd_shuffle!(
300                b,
301                a,
302                [
303                    13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48,
304                    49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
305                ],
306            ),
307            14 => simd_shuffle!(
308                b,
309                a,
310                [
311                    14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49,
312                    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
313                ],
314            ),
315            15 => simd_shuffle!(
316                b,
317                a,
318                [
319                    15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50,
320                    51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
321                ],
322            ),
323            _ => unreachable_unchecked(),
324        };
325        transmute(r)
326    }
327}
328
329/// Computes the bitwise AND of 256 bits (representing integer data)
330/// in `a` and `b`.
331///
332/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
333#[inline]
334#[target_feature(enable = "avx2")]
335#[cfg_attr(test, assert_instr(vandps))]
336#[stable(feature = "simd_x86", since = "1.27.0")]
337pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
338    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
339}
340
341/// Computes the bitwise NOT of 256 bits (representing integer data)
342/// in `a` and then AND with `b`.
343///
344/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
345#[inline]
346#[target_feature(enable = "avx2")]
347#[cfg_attr(test, assert_instr(vandnps))]
348#[stable(feature = "simd_x86", since = "1.27.0")]
349pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
350    unsafe {
351        let all_ones = _mm256_set1_epi8(-1);
352        transmute(simd_and(
353            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
354            b.as_i64x4(),
355        ))
356    }
357}
358
359/// Averages packed unsigned 16-bit integers in `a` and `b`.
360///
361/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
362#[inline]
363#[target_feature(enable = "avx2")]
364#[cfg_attr(test, assert_instr(vpavgw))]
365#[stable(feature = "simd_x86", since = "1.27.0")]
366pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
367    unsafe {
368        let a = simd_cast::<_, u32x16>(a.as_u16x16());
369        let b = simd_cast::<_, u32x16>(b.as_u16x16());
370        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
371        transmute(simd_cast::<_, u16x16>(r))
372    }
373}
374
375/// Averages packed unsigned 8-bit integers in `a` and `b`.
376///
377/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
378#[inline]
379#[target_feature(enable = "avx2")]
380#[cfg_attr(test, assert_instr(vpavgb))]
381#[stable(feature = "simd_x86", since = "1.27.0")]
382pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
383    unsafe {
384        let a = simd_cast::<_, u16x32>(a.as_u8x32());
385        let b = simd_cast::<_, u16x32>(b.as_u8x32());
386        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
387        transmute(simd_cast::<_, u8x32>(r))
388    }
389}
390
391/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
392///
393/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
394#[inline]
395#[target_feature(enable = "avx2")]
396#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
397#[rustc_legacy_const_generics(2)]
398#[stable(feature = "simd_x86", since = "1.27.0")]
399pub fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
400    static_assert_uimm_bits!(IMM4, 4);
401    unsafe {
402        let a = a.as_i32x4();
403        let b = b.as_i32x4();
404        let r: i32x4 = simd_shuffle!(
405            a,
406            b,
407            [
408                [0, 4, 0, 4][IMM4 as usize & 0b11],
409                [1, 1, 5, 5][IMM4 as usize & 0b11],
410                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
411                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
412            ],
413        );
414        transmute(r)
415    }
416}
417
418/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
419///
420/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
421#[inline]
422#[target_feature(enable = "avx2")]
423#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
424#[rustc_legacy_const_generics(2)]
425#[stable(feature = "simd_x86", since = "1.27.0")]
426pub fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
427    static_assert_uimm_bits!(IMM8, 8);
428    unsafe {
429        let a = a.as_i32x8();
430        let b = b.as_i32x8();
431        let r: i32x8 = simd_shuffle!(
432            a,
433            b,
434            [
435                [0, 8, 0, 8][IMM8 as usize & 0b11],
436                [1, 1, 9, 9][IMM8 as usize & 0b11],
437                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
438                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
439                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
440                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
441                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
442                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
443            ],
444        );
445        transmute(r)
446    }
447}
448
449/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
450///
451/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
452#[inline]
453#[target_feature(enable = "avx2")]
454#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
455#[rustc_legacy_const_generics(2)]
456#[stable(feature = "simd_x86", since = "1.27.0")]
457pub fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
458    static_assert_uimm_bits!(IMM8, 8);
459    unsafe {
460        let a = a.as_i16x16();
461        let b = b.as_i16x16();
462
463        let r: i16x16 = simd_shuffle!(
464            a,
465            b,
466            [
467                [0, 16, 0, 16][IMM8 as usize & 0b11],
468                [1, 1, 17, 17][IMM8 as usize & 0b11],
469                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
470                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
471                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
472                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
473                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
474                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
475                [8, 24, 8, 24][IMM8 as usize & 0b11],
476                [9, 9, 25, 25][IMM8 as usize & 0b11],
477                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
478                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
479                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
480                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
481                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
482                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
483            ],
484        );
485        transmute(r)
486    }
487}
488
489/// Blends packed 8-bit integers from `a` and `b` using `mask`.
490///
491/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
492#[inline]
493#[target_feature(enable = "avx2")]
494#[cfg_attr(test, assert_instr(vpblendvb))]
495#[stable(feature = "simd_x86", since = "1.27.0")]
496pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
497    unsafe {
498        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
499        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
500    }
501}
502
503/// Broadcasts the low packed 8-bit integer from `a` to all elements of
504/// the 128-bit returned value.
505///
506/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
507#[inline]
508#[target_feature(enable = "avx2")]
509#[cfg_attr(test, assert_instr(vpbroadcastb))]
510#[stable(feature = "simd_x86", since = "1.27.0")]
511pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
512    unsafe {
513        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
514        transmute::<i8x16, _>(ret)
515    }
516}
517
518/// Broadcasts the low packed 8-bit integer from `a` to all elements of
519/// the 256-bit returned value.
520///
521/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
522#[inline]
523#[target_feature(enable = "avx2")]
524#[cfg_attr(test, assert_instr(vpbroadcastb))]
525#[stable(feature = "simd_x86", since = "1.27.0")]
526pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
527    unsafe {
528        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
529        transmute::<i8x32, _>(ret)
530    }
531}
532
533// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
534// often compiled to `vbroadcastss`.
535/// Broadcasts the low packed 32-bit integer from `a` to all elements of
536/// the 128-bit returned value.
537///
538/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
539#[inline]
540#[target_feature(enable = "avx2")]
541#[cfg_attr(test, assert_instr(vbroadcastss))]
542#[stable(feature = "simd_x86", since = "1.27.0")]
543pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
544    unsafe {
545        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
546        transmute::<i32x4, _>(ret)
547    }
548}
549
550// N.B., `simd_shuffle4`` with integer data types for `a` and `b` is
551// often compiled to `vbroadcastss`.
552/// Broadcasts the low packed 32-bit integer from `a` to all elements of
553/// the 256-bit returned value.
554///
555/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
556#[inline]
557#[target_feature(enable = "avx2")]
558#[cfg_attr(test, assert_instr(vbroadcastss))]
559#[stable(feature = "simd_x86", since = "1.27.0")]
560pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
561    unsafe {
562        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
563        transmute::<i32x8, _>(ret)
564    }
565}
566
567/// Broadcasts the low packed 64-bit integer from `a` to all elements of
568/// the 128-bit returned value.
569///
570/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
571#[inline]
572#[target_feature(enable = "avx2")]
573// Emits `vmovddup` instead of `vpbroadcastq`
574// See https://github.com/rust-lang/stdarch/issues/791
575#[cfg_attr(test, assert_instr(vmovddup))]
576#[stable(feature = "simd_x86", since = "1.27.0")]
577pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
578    unsafe {
579        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
580        transmute::<i64x2, _>(ret)
581    }
582}
583
584/// Broadcasts the low packed 64-bit integer from `a` to all elements of
585/// the 256-bit returned value.
586///
587/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
588#[inline]
589#[target_feature(enable = "avx2")]
590#[cfg_attr(test, assert_instr(vbroadcastsd))]
591#[stable(feature = "simd_x86", since = "1.27.0")]
592pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
593    unsafe {
594        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
595        transmute::<i64x4, _>(ret)
596    }
597}
598
599/// Broadcasts the low double-precision (64-bit) floating-point element
600/// from `a` to all elements of the 128-bit returned value.
601///
602/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
603#[inline]
604#[target_feature(enable = "avx2")]
605#[cfg_attr(test, assert_instr(vmovddup))]
606#[stable(feature = "simd_x86", since = "1.27.0")]
607pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
608    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
609}
610
611/// Broadcasts the low double-precision (64-bit) floating-point element
612/// from `a` to all elements of the 256-bit returned value.
613///
614/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
615#[inline]
616#[target_feature(enable = "avx2")]
617#[cfg_attr(test, assert_instr(vbroadcastsd))]
618#[stable(feature = "simd_x86", since = "1.27.0")]
619pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
620    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
621}
622
623/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
624/// the 256-bit returned value.
625///
626/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
627#[inline]
628#[target_feature(enable = "avx2")]
629#[stable(feature = "simd_x86_updates", since = "1.82.0")]
630pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
631    unsafe {
632        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
633        transmute::<i64x4, _>(ret)
634    }
635}
636
637// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
638// `vbroadcastf128`.
639/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
640/// the 256-bit returned value.
641///
642/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
643#[inline]
644#[target_feature(enable = "avx2")]
645#[stable(feature = "simd_x86", since = "1.27.0")]
646pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
647    unsafe {
648        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
649        transmute::<i64x4, _>(ret)
650    }
651}
652
653/// Broadcasts the low single-precision (32-bit) floating-point element
654/// from `a` to all elements of the 128-bit returned value.
655///
656/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
657#[inline]
658#[target_feature(enable = "avx2")]
659#[cfg_attr(test, assert_instr(vbroadcastss))]
660#[stable(feature = "simd_x86", since = "1.27.0")]
661pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
662    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
663}
664
665/// Broadcasts the low single-precision (32-bit) floating-point element
666/// from `a` to all elements of the 256-bit returned value.
667///
668/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
669#[inline]
670#[target_feature(enable = "avx2")]
671#[cfg_attr(test, assert_instr(vbroadcastss))]
672#[stable(feature = "simd_x86", since = "1.27.0")]
673pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
674    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
675}
676
677/// Broadcasts the low packed 16-bit integer from a to all elements of
678/// the 128-bit returned value
679///
680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
681#[inline]
682#[target_feature(enable = "avx2")]
683#[cfg_attr(test, assert_instr(vpbroadcastw))]
684#[stable(feature = "simd_x86", since = "1.27.0")]
685pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
686    unsafe {
687        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
688        transmute::<i16x8, _>(ret)
689    }
690}
691
692/// Broadcasts the low packed 16-bit integer from a to all elements of
693/// the 256-bit returned value
694///
695/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
696#[inline]
697#[target_feature(enable = "avx2")]
698#[cfg_attr(test, assert_instr(vpbroadcastw))]
699#[stable(feature = "simd_x86", since = "1.27.0")]
700pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
701    unsafe {
702        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
703        transmute::<i16x16, _>(ret)
704    }
705}
706
707/// Compares packed 64-bit integers in `a` and `b` for equality.
708///
709/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
710#[inline]
711#[target_feature(enable = "avx2")]
712#[cfg_attr(test, assert_instr(vpcmpeqq))]
713#[stable(feature = "simd_x86", since = "1.27.0")]
714pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
715    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
716}
717
718/// Compares packed 32-bit integers in `a` and `b` for equality.
719///
720/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
721#[inline]
722#[target_feature(enable = "avx2")]
723#[cfg_attr(test, assert_instr(vpcmpeqd))]
724#[stable(feature = "simd_x86", since = "1.27.0")]
725pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
726    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
727}
728
729/// Compares packed 16-bit integers in `a` and `b` for equality.
730///
731/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
732#[inline]
733#[target_feature(enable = "avx2")]
734#[cfg_attr(test, assert_instr(vpcmpeqw))]
735#[stable(feature = "simd_x86", since = "1.27.0")]
736pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
737    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
738}
739
740/// Compares packed 8-bit integers in `a` and `b` for equality.
741///
742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
743#[inline]
744#[target_feature(enable = "avx2")]
745#[cfg_attr(test, assert_instr(vpcmpeqb))]
746#[stable(feature = "simd_x86", since = "1.27.0")]
747pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
748    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
749}
750
751/// Compares packed 64-bit integers in `a` and `b` for greater-than.
752///
753/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
754#[inline]
755#[target_feature(enable = "avx2")]
756#[cfg_attr(test, assert_instr(vpcmpgtq))]
757#[stable(feature = "simd_x86", since = "1.27.0")]
758pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
759    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
760}
761
762/// Compares packed 32-bit integers in `a` and `b` for greater-than.
763///
764/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
765#[inline]
766#[target_feature(enable = "avx2")]
767#[cfg_attr(test, assert_instr(vpcmpgtd))]
768#[stable(feature = "simd_x86", since = "1.27.0")]
769pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
770    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
771}
772
773/// Compares packed 16-bit integers in `a` and `b` for greater-than.
774///
775/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
776#[inline]
777#[target_feature(enable = "avx2")]
778#[cfg_attr(test, assert_instr(vpcmpgtw))]
779#[stable(feature = "simd_x86", since = "1.27.0")]
780pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
781    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
782}
783
784/// Compares packed 8-bit integers in `a` and `b` for greater-than.
785///
786/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
787#[inline]
788#[target_feature(enable = "avx2")]
789#[cfg_attr(test, assert_instr(vpcmpgtb))]
790#[stable(feature = "simd_x86", since = "1.27.0")]
791pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
792    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
793}
794
795/// Sign-extend 16-bit integers to 32-bit integers.
796///
797/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
798#[inline]
799#[target_feature(enable = "avx2")]
800#[cfg_attr(test, assert_instr(vpmovsxwd))]
801#[stable(feature = "simd_x86", since = "1.27.0")]
802pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
803    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
804}
805
806/// Sign-extend 16-bit integers to 64-bit integers.
807///
808/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
809#[inline]
810#[target_feature(enable = "avx2")]
811#[cfg_attr(test, assert_instr(vpmovsxwq))]
812#[stable(feature = "simd_x86", since = "1.27.0")]
813pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
814    unsafe {
815        let a = a.as_i16x8();
816        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
817        transmute::<i64x4, _>(simd_cast(v64))
818    }
819}
820
821/// Sign-extend 32-bit integers to 64-bit integers.
822///
823/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
824#[inline]
825#[target_feature(enable = "avx2")]
826#[cfg_attr(test, assert_instr(vpmovsxdq))]
827#[stable(feature = "simd_x86", since = "1.27.0")]
828pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
829    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
830}
831
832/// Sign-extend 8-bit integers to 16-bit integers.
833///
834/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
835#[inline]
836#[target_feature(enable = "avx2")]
837#[cfg_attr(test, assert_instr(vpmovsxbw))]
838#[stable(feature = "simd_x86", since = "1.27.0")]
839pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
840    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
841}
842
843/// Sign-extend 8-bit integers to 32-bit integers.
844///
845/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
846#[inline]
847#[target_feature(enable = "avx2")]
848#[cfg_attr(test, assert_instr(vpmovsxbd))]
849#[stable(feature = "simd_x86", since = "1.27.0")]
850pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
851    unsafe {
852        let a = a.as_i8x16();
853        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
854        transmute::<i32x8, _>(simd_cast(v64))
855    }
856}
857
858/// Sign-extend 8-bit integers to 64-bit integers.
859///
860/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
861#[inline]
862#[target_feature(enable = "avx2")]
863#[cfg_attr(test, assert_instr(vpmovsxbq))]
864#[stable(feature = "simd_x86", since = "1.27.0")]
865pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
866    unsafe {
867        let a = a.as_i8x16();
868        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
869        transmute::<i64x4, _>(simd_cast(v32))
870    }
871}
872
873/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit
874/// integers, and stores the results in `dst`.
875///
876/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
877#[inline]
878#[target_feature(enable = "avx2")]
879#[cfg_attr(test, assert_instr(vpmovzxwd))]
880#[stable(feature = "simd_x86", since = "1.27.0")]
881pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
882    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
883}
884
885/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
886/// integers. The upper four elements of `a` are unused.
887///
888/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
889#[inline]
890#[target_feature(enable = "avx2")]
891#[cfg_attr(test, assert_instr(vpmovzxwq))]
892#[stable(feature = "simd_x86", since = "1.27.0")]
893pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
894    unsafe {
895        let a = a.as_u16x8();
896        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
897        transmute::<i64x4, _>(simd_cast(v64))
898    }
899}
900
901/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
902///
903/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
904#[inline]
905#[target_feature(enable = "avx2")]
906#[cfg_attr(test, assert_instr(vpmovzxdq))]
907#[stable(feature = "simd_x86", since = "1.27.0")]
908pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
909    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
910}
911
912/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
913///
914/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
915#[inline]
916#[target_feature(enable = "avx2")]
917#[cfg_attr(test, assert_instr(vpmovzxbw))]
918#[stable(feature = "simd_x86", since = "1.27.0")]
919pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
920    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
921}
922
923/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
924/// integers. The upper eight elements of `a` are unused.
925///
926/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
927#[inline]
928#[target_feature(enable = "avx2")]
929#[cfg_attr(test, assert_instr(vpmovzxbd))]
930#[stable(feature = "simd_x86", since = "1.27.0")]
931pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
932    unsafe {
933        let a = a.as_u8x16();
934        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
935        transmute::<i32x8, _>(simd_cast(v64))
936    }
937}
938
939/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
940/// integers. The upper twelve elements of `a` are unused.
941///
942/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
943#[inline]
944#[target_feature(enable = "avx2")]
945#[cfg_attr(test, assert_instr(vpmovzxbq))]
946#[stable(feature = "simd_x86", since = "1.27.0")]
947pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
948    unsafe {
949        let a = a.as_u8x16();
950        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
951        transmute::<i64x4, _>(simd_cast(v32))
952    }
953}
954
955/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
956///
957/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
958#[inline]
959#[target_feature(enable = "avx2")]
960#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
961#[rustc_legacy_const_generics(1)]
962#[stable(feature = "simd_x86", since = "1.27.0")]
963pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
964    static_assert_uimm_bits!(IMM1, 1);
965    unsafe {
966        let a = a.as_i64x4();
967        let b = i64x4::ZERO;
968        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
969        transmute(dst)
970    }
971}
972
973/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
974///
975/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
976#[inline]
977#[target_feature(enable = "avx2")]
978#[cfg_attr(test, assert_instr(vphaddw))]
979#[stable(feature = "simd_x86", since = "1.27.0")]
980pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
981    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
982}
983
984/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
985///
986/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
987#[inline]
988#[target_feature(enable = "avx2")]
989#[cfg_attr(test, assert_instr(vphaddd))]
990#[stable(feature = "simd_x86", since = "1.27.0")]
991pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
992    unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
993}
994
995/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
996/// using saturation.
997///
998/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
999#[inline]
1000#[target_feature(enable = "avx2")]
1001#[cfg_attr(test, assert_instr(vphaddsw))]
1002#[stable(feature = "simd_x86", since = "1.27.0")]
1003pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
1004    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
1005}
1006
1007/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
1008///
1009/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
1010#[inline]
1011#[target_feature(enable = "avx2")]
1012#[cfg_attr(test, assert_instr(vphsubw))]
1013#[stable(feature = "simd_x86", since = "1.27.0")]
1014pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
1015    unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
1016}
1017
1018/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
1019///
1020/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
1021#[inline]
1022#[target_feature(enable = "avx2")]
1023#[cfg_attr(test, assert_instr(vphsubd))]
1024#[stable(feature = "simd_x86", since = "1.27.0")]
1025pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
1026    unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
1027}
1028
1029/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
1030/// using saturation.
1031///
1032/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
1033#[inline]
1034#[target_feature(enable = "avx2")]
1035#[cfg_attr(test, assert_instr(vphsubsw))]
1036#[stable(feature = "simd_x86", since = "1.27.0")]
1037pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1038    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
1039}
1040
1041/// Returns values from `slice` at offsets determined by `offsets * scale`,
1042/// where
1043/// `scale` should be 1, 2, 4 or 8.
1044///
1045/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
1046#[inline]
1047#[target_feature(enable = "avx2")]
1048#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1049#[rustc_legacy_const_generics(2)]
1050#[stable(feature = "simd_x86", since = "1.27.0")]
1051pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
1052    slice: *const i32,
1053    offsets: __m128i,
1054) -> __m128i {
1055    static_assert_imm8_scale!(SCALE);
1056    let zero = i32x4::ZERO;
1057    let neg_one = _mm_set1_epi32(-1).as_i32x4();
1058    let offsets = offsets.as_i32x4();
1059    let slice = slice as *const i8;
1060    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1061    transmute(r)
1062}
1063
1064/// Returns values from `slice` at offsets determined by `offsets * scale`,
1065/// where
1066/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1067/// that position instead.
1068///
1069/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
1070#[inline]
1071#[target_feature(enable = "avx2")]
1072#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1073#[rustc_legacy_const_generics(4)]
1074#[stable(feature = "simd_x86", since = "1.27.0")]
1075pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
1076    src: __m128i,
1077    slice: *const i32,
1078    offsets: __m128i,
1079    mask: __m128i,
1080) -> __m128i {
1081    static_assert_imm8_scale!(SCALE);
1082    let src = src.as_i32x4();
1083    let mask = mask.as_i32x4();
1084    let offsets = offsets.as_i32x4();
1085    let slice = slice as *const i8;
1086    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
1087    transmute(r)
1088}
1089
1090/// Returns values from `slice` at offsets determined by `offsets * scale`,
1091/// where
1092/// `scale` should be 1, 2, 4 or 8.
1093///
1094/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
1095#[inline]
1096#[target_feature(enable = "avx2")]
1097#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1098#[rustc_legacy_const_generics(2)]
1099#[stable(feature = "simd_x86", since = "1.27.0")]
1100pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
1101    slice: *const i32,
1102    offsets: __m256i,
1103) -> __m256i {
1104    static_assert_imm8_scale!(SCALE);
1105    let zero = i32x8::ZERO;
1106    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1107    let offsets = offsets.as_i32x8();
1108    let slice = slice as *const i8;
1109    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1110    transmute(r)
1111}
1112
1113/// Returns values from `slice` at offsets determined by `offsets * scale`,
1114/// where
1115/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1116/// that position instead.
1117///
1118/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
1119#[inline]
1120#[target_feature(enable = "avx2")]
1121#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1122#[rustc_legacy_const_generics(4)]
1123#[stable(feature = "simd_x86", since = "1.27.0")]
1124pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
1125    src: __m256i,
1126    slice: *const i32,
1127    offsets: __m256i,
1128    mask: __m256i,
1129) -> __m256i {
1130    static_assert_imm8_scale!(SCALE);
1131    let src = src.as_i32x8();
1132    let mask = mask.as_i32x8();
1133    let offsets = offsets.as_i32x8();
1134    let slice = slice as *const i8;
1135    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
1136    transmute(r)
1137}
1138
1139/// Returns values from `slice` at offsets determined by `offsets * scale`,
1140/// where
1141/// `scale` should be 1, 2, 4 or 8.
1142///
1143/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
1144#[inline]
1145#[target_feature(enable = "avx2")]
1146#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1147#[rustc_legacy_const_generics(2)]
1148#[stable(feature = "simd_x86", since = "1.27.0")]
1149pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1150    static_assert_imm8_scale!(SCALE);
1151    let zero = _mm_setzero_ps();
1152    let neg_one = _mm_set1_ps(-1.0);
1153    let offsets = offsets.as_i32x4();
1154    let slice = slice as *const i8;
1155    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1156}
1157
1158/// Returns values from `slice` at offsets determined by `offsets * scale`,
1159/// where
1160/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1161/// that position instead.
1162///
1163/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
1164#[inline]
1165#[target_feature(enable = "avx2")]
1166#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1167#[rustc_legacy_const_generics(4)]
1168#[stable(feature = "simd_x86", since = "1.27.0")]
1169pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
1170    src: __m128,
1171    slice: *const f32,
1172    offsets: __m128i,
1173    mask: __m128,
1174) -> __m128 {
1175    static_assert_imm8_scale!(SCALE);
1176    let offsets = offsets.as_i32x4();
1177    let slice = slice as *const i8;
1178    pgatherdps(src, slice, offsets, mask, SCALE as i8)
1179}
1180
1181/// Returns values from `slice` at offsets determined by `offsets * scale`,
1182/// where
1183/// `scale` should be 1, 2, 4 or 8.
1184///
1185/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
1186#[inline]
1187#[target_feature(enable = "avx2")]
1188#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1189#[rustc_legacy_const_generics(2)]
1190#[stable(feature = "simd_x86", since = "1.27.0")]
1191pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
1192    static_assert_imm8_scale!(SCALE);
1193    let zero = _mm256_setzero_ps();
1194    let neg_one = _mm256_set1_ps(-1.0);
1195    let offsets = offsets.as_i32x8();
1196    let slice = slice as *const i8;
1197    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1198}
1199
1200/// Returns values from `slice` at offsets determined by `offsets * scale`,
1201/// where
1202/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1203/// that position instead.
1204///
1205/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
1206#[inline]
1207#[target_feature(enable = "avx2")]
1208#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1209#[rustc_legacy_const_generics(4)]
1210#[stable(feature = "simd_x86", since = "1.27.0")]
1211pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
1212    src: __m256,
1213    slice: *const f32,
1214    offsets: __m256i,
1215    mask: __m256,
1216) -> __m256 {
1217    static_assert_imm8_scale!(SCALE);
1218    let offsets = offsets.as_i32x8();
1219    let slice = slice as *const i8;
1220    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
1221}
1222
1223/// Returns values from `slice` at offsets determined by `offsets * scale`,
1224/// where
1225/// `scale` should be 1, 2, 4 or 8.
1226///
1227/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
1228#[inline]
1229#[target_feature(enable = "avx2")]
1230#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1231#[rustc_legacy_const_generics(2)]
1232#[stable(feature = "simd_x86", since = "1.27.0")]
1233pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
1234    slice: *const i64,
1235    offsets: __m128i,
1236) -> __m128i {
1237    static_assert_imm8_scale!(SCALE);
1238    let zero = i64x2::ZERO;
1239    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1240    let offsets = offsets.as_i32x4();
1241    let slice = slice as *const i8;
1242    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1243    transmute(r)
1244}
1245
1246/// Returns values from `slice` at offsets determined by `offsets * scale`,
1247/// where
1248/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1249/// that position instead.
1250///
1251/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
1252#[inline]
1253#[target_feature(enable = "avx2")]
1254#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1255#[rustc_legacy_const_generics(4)]
1256#[stable(feature = "simd_x86", since = "1.27.0")]
1257pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
1258    src: __m128i,
1259    slice: *const i64,
1260    offsets: __m128i,
1261    mask: __m128i,
1262) -> __m128i {
1263    static_assert_imm8_scale!(SCALE);
1264    let src = src.as_i64x2();
1265    let mask = mask.as_i64x2();
1266    let offsets = offsets.as_i32x4();
1267    let slice = slice as *const i8;
1268    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
1269    transmute(r)
1270}
1271
1272/// Returns values from `slice` at offsets determined by `offsets * scale`,
1273/// where
1274/// `scale` should be 1, 2, 4 or 8.
1275///
1276/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
1277#[inline]
1278#[target_feature(enable = "avx2")]
1279#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1280#[rustc_legacy_const_generics(2)]
1281#[stable(feature = "simd_x86", since = "1.27.0")]
1282pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
1283    slice: *const i64,
1284    offsets: __m128i,
1285) -> __m256i {
1286    static_assert_imm8_scale!(SCALE);
1287    let zero = i64x4::ZERO;
1288    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1289    let offsets = offsets.as_i32x4();
1290    let slice = slice as *const i8;
1291    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1292    transmute(r)
1293}
1294
1295/// Returns values from `slice` at offsets determined by `offsets * scale`,
1296/// where
1297/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1298/// that position instead.
1299///
1300/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
1301#[inline]
1302#[target_feature(enable = "avx2")]
1303#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1304#[rustc_legacy_const_generics(4)]
1305#[stable(feature = "simd_x86", since = "1.27.0")]
1306pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
1307    src: __m256i,
1308    slice: *const i64,
1309    offsets: __m128i,
1310    mask: __m256i,
1311) -> __m256i {
1312    static_assert_imm8_scale!(SCALE);
1313    let src = src.as_i64x4();
1314    let mask = mask.as_i64x4();
1315    let offsets = offsets.as_i32x4();
1316    let slice = slice as *const i8;
1317    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
1318    transmute(r)
1319}
1320
1321/// Returns values from `slice` at offsets determined by `offsets * scale`,
1322/// where
1323/// `scale` should be 1, 2, 4 or 8.
1324///
1325/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
1326#[inline]
1327#[target_feature(enable = "avx2")]
1328#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1329#[rustc_legacy_const_generics(2)]
1330#[stable(feature = "simd_x86", since = "1.27.0")]
1331pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1332    static_assert_imm8_scale!(SCALE);
1333    let zero = _mm_setzero_pd();
1334    let neg_one = _mm_set1_pd(-1.0);
1335    let offsets = offsets.as_i32x4();
1336    let slice = slice as *const i8;
1337    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1338}
1339
1340/// Returns values from `slice` at offsets determined by `offsets * scale`,
1341/// where
1342/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1343/// that position instead.
1344///
1345/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
1346#[inline]
1347#[target_feature(enable = "avx2")]
1348#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1349#[rustc_legacy_const_generics(4)]
1350#[stable(feature = "simd_x86", since = "1.27.0")]
1351pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
1352    src: __m128d,
1353    slice: *const f64,
1354    offsets: __m128i,
1355    mask: __m128d,
1356) -> __m128d {
1357    static_assert_imm8_scale!(SCALE);
1358    let offsets = offsets.as_i32x4();
1359    let slice = slice as *const i8;
1360    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
1361}
1362
1363/// Returns values from `slice` at offsets determined by `offsets * scale`,
1364/// where `scale` should be 1, 2, 4 or 8; `offsets[i] * scale` is the byte
1365/// offset of element `i` from `slice`.
1366///
1367/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
1368#[inline]
1369#[target_feature(enable = "avx2")]
1370#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1371#[rustc_legacy_const_generics(2)]
1372#[stable(feature = "simd_x86", since = "1.27.0")]
1373pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
1374    slice: *const f64,
1375    offsets: __m128i,
1376) -> __m256d {
1377    static_assert_imm8_scale!(SCALE);
1378    let zero = _mm256_setzero_pd();
1379    let neg_one = _mm256_set1_pd(-1.0);
1380    let offsets = offsets.as_i32x4();
1381    let slice = slice as *const i8;
1382    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1383}
1384
1385/// Returns values from `slice` at offsets determined by `offsets * scale`,
1386/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1387/// element does not have its highest bit set are copied from `src` instead
1388/// of being loaded from memory.
1389///
1390/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
1391#[inline]
1392#[target_feature(enable = "avx2")]
1393#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1394#[rustc_legacy_const_generics(4)]
1395#[stable(feature = "simd_x86", since = "1.27.0")]
1396pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
1397    src: __m256d,
1398    slice: *const f64,
1399    offsets: __m128i,
1400    mask: __m256d,
1401) -> __m256d {
1402    static_assert_imm8_scale!(SCALE);
1403    let offsets = offsets.as_i32x4();
1404    let slice = slice as *const i8;
1405    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
1406}
1407
1408/// Returns values from `slice` at offsets determined by `offsets * scale`,
1409/// where `scale` should be 1, 2, 4 or 8; `offsets[i] * scale` is the byte
1410/// offset of element `i` from `slice`.
1411///
1412/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
1413#[inline]
1414#[target_feature(enable = "avx2")]
1415#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1416#[rustc_legacy_const_generics(2)]
1417#[stable(feature = "simd_x86", since = "1.27.0")]
1418pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
1419    slice: *const i32,
1420    offsets: __m128i,
1421) -> __m128i {
1422    static_assert_imm8_scale!(SCALE);
1423    let zero = i32x4::ZERO;
1424    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1425    let offsets = offsets.as_i64x2();
1426    let slice = slice as *const i8;
1427    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1428    transmute(r)
1429}
1430
1431/// Returns values from `slice` at offsets determined by `offsets * scale`,
1432/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1433/// element does not have its highest bit set are copied from `src` instead
1434/// of being loaded from memory.
1435///
1436/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
1437#[inline]
1438#[target_feature(enable = "avx2")]
1439#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1440#[rustc_legacy_const_generics(4)]
1441#[stable(feature = "simd_x86", since = "1.27.0")]
1442pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
1443    src: __m128i,
1444    slice: *const i32,
1445    offsets: __m128i,
1446    mask: __m128i,
1447) -> __m128i {
1448    static_assert_imm8_scale!(SCALE);
1449    let src = src.as_i32x4();
1450    let mask = mask.as_i32x4();
1451    let offsets = offsets.as_i64x2();
1452    let slice = slice as *const i8;
1453    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
1454    transmute(r)
1455}
1456
1457/// Returns values from `slice` at offsets determined by `offsets * scale`,
1458/// where `scale` should be 1, 2, 4 or 8; `offsets[i] * scale` is the byte
1459/// offset of element `i` from `slice`.
1460///
1461/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
1462#[inline]
1463#[target_feature(enable = "avx2")]
1464#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1465#[rustc_legacy_const_generics(2)]
1466#[stable(feature = "simd_x86", since = "1.27.0")]
1467pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
1468    slice: *const i32,
1469    offsets: __m256i,
1470) -> __m128i {
1471    static_assert_imm8_scale!(SCALE);
1472    let zero = i32x4::ZERO;
1473    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1474    let offsets = offsets.as_i64x4();
1475    let slice = slice as *const i8;
1476    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1477    transmute(r)
1478}
1479
1480/// Returns values from `slice` at offsets determined by `offsets * scale`,
1481/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1482/// element does not have its highest bit set are copied from `src` instead
1483/// of being loaded from memory.
1484///
1485/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
1486#[inline]
1487#[target_feature(enable = "avx2")]
1488#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1489#[rustc_legacy_const_generics(4)]
1490#[stable(feature = "simd_x86", since = "1.27.0")]
1491pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
1492    src: __m128i,
1493    slice: *const i32,
1494    offsets: __m256i,
1495    mask: __m128i,
1496) -> __m128i {
1497    static_assert_imm8_scale!(SCALE);
1498    let src = src.as_i32x4();
1499    let mask = mask.as_i32x4();
1500    let offsets = offsets.as_i64x4();
1501    let slice = slice as *const i8;
1502    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
1503    transmute(r)
1504}
1505
1506/// Returns values from `slice` at offsets determined by `offsets * scale`,
1507/// where `scale` should be 1, 2, 4 or 8; `offsets[i] * scale` is the byte
1508/// offset of element `i` from `slice`.
1509///
1510/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
1511#[inline]
1512#[target_feature(enable = "avx2")]
1513#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1514#[rustc_legacy_const_generics(2)]
1515#[stable(feature = "simd_x86", since = "1.27.0")]
1516pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1517    static_assert_imm8_scale!(SCALE);
1518    let zero = _mm_setzero_ps();
1519    let neg_one = _mm_set1_ps(-1.0);
1520    let offsets = offsets.as_i64x2();
1521    let slice = slice as *const i8;
1522    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1523}
1524
1525/// Returns values from `slice` at offsets determined by `offsets * scale`,
1526/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1527/// element does not have its highest bit set are copied from `src` instead
1528/// of being loaded from memory.
1529///
1530/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
1531#[inline]
1532#[target_feature(enable = "avx2")]
1533#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1534#[rustc_legacy_const_generics(4)]
1535#[stable(feature = "simd_x86", since = "1.27.0")]
1536pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
1537    src: __m128,
1538    slice: *const f32,
1539    offsets: __m128i,
1540    mask: __m128,
1541) -> __m128 {
1542    static_assert_imm8_scale!(SCALE);
1543    let offsets = offsets.as_i64x2();
1544    let slice = slice as *const i8;
1545    pgatherqps(src, slice, offsets, mask, SCALE as i8)
1546}
1547
1548/// Returns values from `slice` at offsets determined by `offsets * scale`,
1549/// where `scale` should be 1, 2, 4 or 8; `offsets[i] * scale` is the byte
1550/// offset of element `i` from `slice`.
1551///
1552/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
1553#[inline]
1554#[target_feature(enable = "avx2")]
1555#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1556#[rustc_legacy_const_generics(2)]
1557#[stable(feature = "simd_x86", since = "1.27.0")]
1558pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
1559    static_assert_imm8_scale!(SCALE);
1560    let zero = _mm_setzero_ps();
1561    let neg_one = _mm_set1_ps(-1.0);
1562    let offsets = offsets.as_i64x4();
1563    let slice = slice as *const i8;
1564    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1565}
1566
1567/// Returns values from `slice` at offsets determined by `offsets * scale`,
1568/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1569/// element does not have its highest bit set are copied from `src` instead
1570/// of being loaded from memory.
1571///
1572/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
1573#[inline]
1574#[target_feature(enable = "avx2")]
1575#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1576#[rustc_legacy_const_generics(4)]
1577#[stable(feature = "simd_x86", since = "1.27.0")]
1578pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
1579    src: __m128,
1580    slice: *const f32,
1581    offsets: __m256i,
1582    mask: __m128,
1583) -> __m128 {
1584    static_assert_imm8_scale!(SCALE);
1585    let offsets = offsets.as_i64x4();
1586    let slice = slice as *const i8;
1587    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
1588}
1589
1590/// Returns values from `slice` at offsets determined by `offsets * scale`,
1591/// where `scale` should be 1, 2, 4 or 8; `offsets[i] * scale` is the byte
1592/// offset of element `i` from `slice`.
1593///
1594/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
1595#[inline]
1596#[target_feature(enable = "avx2")]
1597#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1598#[rustc_legacy_const_generics(2)]
1599#[stable(feature = "simd_x86", since = "1.27.0")]
1600pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
1601    slice: *const i64,
1602    offsets: __m128i,
1603) -> __m128i {
1604    static_assert_imm8_scale!(SCALE);
1605    let zero = i64x2::ZERO;
1606    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1607    let slice = slice as *const i8;
1608    let offsets = offsets.as_i64x2();
1609    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1610    transmute(r)
1611}
1612
1613/// Returns values from `slice` at offsets determined by `offsets * scale`,
1614/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1615/// element does not have its highest bit set are copied from `src` instead
1616/// of being loaded from memory.
1617///
1618/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
1619#[inline]
1620#[target_feature(enable = "avx2")]
1621#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1622#[rustc_legacy_const_generics(4)]
1623#[stable(feature = "simd_x86", since = "1.27.0")]
1624pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
1625    src: __m128i,
1626    slice: *const i64,
1627    offsets: __m128i,
1628    mask: __m128i,
1629) -> __m128i {
1630    static_assert_imm8_scale!(SCALE);
1631    let src = src.as_i64x2();
1632    let mask = mask.as_i64x2();
1633    let offsets = offsets.as_i64x2();
1634    let slice = slice as *const i8;
1635    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
1636    transmute(r)
1637}
1638
1639/// Returns values from `slice` at offsets determined by `offsets * scale`,
1640/// where `scale` should be 1, 2, 4 or 8; `offsets[i] * scale` is the byte
1641/// offset of element `i` from `slice`.
1642///
1643/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
1644#[inline]
1645#[target_feature(enable = "avx2")]
1646#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1647#[rustc_legacy_const_generics(2)]
1648#[stable(feature = "simd_x86", since = "1.27.0")]
1649pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
1650    slice: *const i64,
1651    offsets: __m256i,
1652) -> __m256i {
1653    static_assert_imm8_scale!(SCALE);
1654    let zero = i64x4::ZERO;
1655    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1656    let slice = slice as *const i8;
1657    let offsets = offsets.as_i64x4();
1658    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1659    transmute(r)
1660}
1661
1662/// Returns values from `slice` at offsets determined by `offsets * scale`,
1663/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1664/// element does not have its highest bit set are copied from `src` instead
1665/// of being loaded from memory.
1666///
1667/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
1668#[inline]
1669#[target_feature(enable = "avx2")]
1670#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1671#[rustc_legacy_const_generics(4)]
1672#[stable(feature = "simd_x86", since = "1.27.0")]
1673pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
1674    src: __m256i,
1675    slice: *const i64,
1676    offsets: __m256i,
1677    mask: __m256i,
1678) -> __m256i {
1679    static_assert_imm8_scale!(SCALE);
1680    let src = src.as_i64x4();
1681    let mask = mask.as_i64x4();
1682    let offsets = offsets.as_i64x4();
1683    let slice = slice as *const i8;
1684    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
1685    transmute(r)
1686}
1687
1688/// Returns values from `slice` at offsets determined by `offsets * scale`,
1689/// where `scale` should be 1, 2, 4 or 8; `offsets[i] * scale` is the byte
1690/// offset of element `i` from `slice`.
1691///
1692/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
1693#[inline]
1694#[target_feature(enable = "avx2")]
1695#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1696#[rustc_legacy_const_generics(2)]
1697#[stable(feature = "simd_x86", since = "1.27.0")]
1698pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1699    static_assert_imm8_scale!(SCALE);
1700    let zero = _mm_setzero_pd();
1701    let neg_one = _mm_set1_pd(-1.0);
1702    let slice = slice as *const i8;
1703    let offsets = offsets.as_i64x2();
1704    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1705}
1706
1707/// Returns values from `slice` at offsets determined by `offsets * scale`,
1708/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1709/// element does not have its highest bit set are copied from `src` instead
1710/// of being loaded from memory.
1711///
1712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
1713#[inline]
1714#[target_feature(enable = "avx2")]
1715#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1716#[rustc_legacy_const_generics(4)]
1717#[stable(feature = "simd_x86", since = "1.27.0")]
1718pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
1719    src: __m128d,
1720    slice: *const f64,
1721    offsets: __m128i,
1722    mask: __m128d,
1723) -> __m128d {
1724    static_assert_imm8_scale!(SCALE);
1725    let slice = slice as *const i8;
1726    let offsets = offsets.as_i64x2();
1727    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
1728}
1729
1730/// Returns values from `slice` at offsets determined by `offsets * scale`,
1731/// where `scale` should be 1, 2, 4 or 8; `offsets[i] * scale` is the byte
1732/// offset of element `i` from `slice`.
1733///
1734/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
1735#[inline]
1736#[target_feature(enable = "avx2")]
1737#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1738#[rustc_legacy_const_generics(2)]
1739#[stable(feature = "simd_x86", since = "1.27.0")]
1740pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
1741    slice: *const f64,
1742    offsets: __m256i,
1743) -> __m256d {
1744    static_assert_imm8_scale!(SCALE);
1745    let zero = _mm256_setzero_pd();
1746    let neg_one = _mm256_set1_pd(-1.0);
1747    let slice = slice as *const i8;
1748    let offsets = offsets.as_i64x4();
1749    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1750}
1751
1752/// Returns values from `slice` at offsets determined by `offsets * scale`,
1753/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1754/// element does not have its highest bit set are copied from `src` instead
1755/// of being loaded from memory.
1756///
1757/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
1758#[inline]
1759#[target_feature(enable = "avx2")]
1760#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1761#[rustc_legacy_const_generics(4)]
1762#[stable(feature = "simd_x86", since = "1.27.0")]
1763pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
1764    src: __m256d,
1765    slice: *const f64,
1766    offsets: __m256i,
1767    mask: __m256d,
1768) -> __m256d {
1769    static_assert_imm8_scale!(SCALE);
1770    let slice = slice as *const i8;
1771    let offsets = offsets.as_i64x4();
1772    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
1773}
1774
1775/// Copies `a` to the result, then inserts 128 bits (of integer data) from `b`
1776/// at the location specified by `IMM1`.
1777///
1778/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
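///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime; `IMM1 = 1` replaces the upper 128 bits:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// let b = _mm_set_epi64x(9, 8); // lanes [8, 9]
/// let r = _mm256_inserti128_si256::<1>(a, b);
/// let expected = _mm256_setr_epi64x(0, 1, 8, 9);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```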
1779#[inline]
1780#[target_feature(enable = "avx2")]
1781#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1782#[rustc_legacy_const_generics(2)]
1783#[stable(feature = "simd_x86", since = "1.27.0")]
1784pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1785    static_assert_uimm_bits!(IMM1, 1);
1786    unsafe {
1787        let a = a.as_i64x4();
1788        let b = _mm256_castsi128_si256(b).as_i64x4();
1789        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
1790        transmute(dst)
1791    }
1792}
1793
1794/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
1795/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
1796/// of intermediate 32-bit integers and returns the packed results.
1797///
1798/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
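///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime; each 32-bit result is
/// `a[2k] * b[2k] + a[2k+1] * b[2k+1]`:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(2);
/// let b = _mm256_set1_epi16(3);
/// let r = _mm256_madd_epi16(a, b); // every 32-bit lane: 2*3 + 2*3 = 12
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi32(r, _mm256_set1_epi32(12))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```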
1799#[inline]
1800#[target_feature(enable = "avx2")]
1801#[cfg_attr(test, assert_instr(vpmaddwd))]
1802#[stable(feature = "simd_x86", since = "1.27.0")]
1803pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1804    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
1805}
1806
1807/// Vertically multiplies each unsigned 8-bit integer from `a` with the
1808/// corresponding signed 8-bit integer from `b`, producing intermediate
1809/// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
1810/// signed 16-bit integers using saturation, and returns the packed results.
1811///
1812/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
1813#[inline]
1814#[target_feature(enable = "avx2")]
1815#[cfg_attr(test, assert_instr(vpmaddubsw))]
1816#[stable(feature = "simd_x86", since = "1.27.0")]
1817pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1818    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) }
1819}
1820
1821/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1822/// (elements are zeroed out when the highest bit is not set in the
1823/// corresponding element).
1824///
1825/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
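///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data: [i32; 4] = [1, 2, 3, 4];
/// // Only lanes whose mask element has its highest bit set are loaded.
/// let mask = _mm_setr_epi32(-1, 0, -1, 0);
/// let r = _mm_maskload_epi32(data.as_ptr(), mask);
/// let expected = _mm_setr_epi32(1, 0, 3, 0);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, expected)), 0xffff);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```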
1826#[inline]
1827#[target_feature(enable = "avx2")]
1828#[cfg_attr(test, assert_instr(vpmaskmovd))]
1829#[stable(feature = "simd_x86", since = "1.27.0")]
1830pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1831    transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1832}
1833
1834/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1835/// (elements are zeroed out when the highest bit is not set in the
1836/// corresponding element).
1837///
1838/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
1839#[inline]
1840#[target_feature(enable = "avx2")]
1841#[cfg_attr(test, assert_instr(vpmaskmovd))]
1842#[stable(feature = "simd_x86", since = "1.27.0")]
1843pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
1844    transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
1845}
1846
1847/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1848/// (elements are zeroed out when the highest bit is not set in the
1849/// corresponding element).
1850///
1851/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
1852#[inline]
1853#[target_feature(enable = "avx2")]
1854#[cfg_attr(test, assert_instr(vpmaskmovq))]
1855#[stable(feature = "simd_x86", since = "1.27.0")]
1856pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
1857    transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
1858}
1859
1860/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1861/// (elements are zeroed out when the highest bit is not set in the
1862/// corresponding element).
1863///
1864/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
1865#[inline]
1866#[target_feature(enable = "avx2")]
1867#[cfg_attr(test, assert_instr(vpmaskmovq))]
1868#[stable(feature = "simd_x86", since = "1.27.0")]
1869pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
1870    transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
1871}
1872
1873/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1874/// using `mask` (elements are not stored when the highest bit is not set
1875/// in the corresponding element).
1876///
1877/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
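///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let mut out = [0i32; 4];
/// let a = _mm_setr_epi32(1, 2, 3, 4);
/// // Only lanes whose mask element has its highest bit set are written.
/// let mask = _mm_setr_epi32(-1, 0, -1, 0);
/// _mm_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 0, 3, 0]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```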
1878#[inline]
1879#[target_feature(enable = "avx2")]
1880#[cfg_attr(test, assert_instr(vpmaskmovd))]
1881#[stable(feature = "simd_x86", since = "1.27.0")]
1882pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
1883    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
1884}
1885
1886/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1887/// using `mask` (elements are not stored when the highest bit is not set
1888/// in the corresponding element).
1889///
1890/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
1891#[inline]
1892#[target_feature(enable = "avx2")]
1893#[cfg_attr(test, assert_instr(vpmaskmovd))]
1894#[stable(feature = "simd_x86", since = "1.27.0")]
1895pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
1896    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
1897}
1898
1899/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1900/// using `mask` (elements are not stored when the highest bit is not set
1901/// in the corresponding element).
1902///
1903/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
1904#[inline]
1905#[target_feature(enable = "avx2")]
1906#[cfg_attr(test, assert_instr(vpmaskmovq))]
1907#[stable(feature = "simd_x86", since = "1.27.0")]
1908pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
1909    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
1910}
1911
1912/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1913/// using `mask` (elements are not stored when the highest bit is not set
1914/// in the corresponding element).
1915///
1916/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
1917#[inline]
1918#[target_feature(enable = "avx2")]
1919#[cfg_attr(test, assert_instr(vpmaskmovq))]
1920#[stable(feature = "simd_x86", since = "1.27.0")]
1921pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
1922    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
1923}
1924
1925/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1926/// maximum values.
1927///
1928/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1929#[inline]
1930#[target_feature(enable = "avx2")]
1931#[cfg_attr(test, assert_instr(vpmaxsw))]
1932#[stable(feature = "simd_x86", since = "1.27.0")]
1933pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
1934    unsafe {
1935        let a = a.as_i16x16();
1936        let b = b.as_i16x16();
1937        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1938    }
1939}
1940
1941/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1942/// maximum values.
1943///
1944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
1945#[inline]
1946#[target_feature(enable = "avx2")]
1947#[cfg_attr(test, assert_instr(vpmaxsd))]
1948#[stable(feature = "simd_x86", since = "1.27.0")]
1949pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
1950    unsafe {
1951        let a = a.as_i32x8();
1952        let b = b.as_i32x8();
1953        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1954    }
1955}
1956
1957/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1958/// maximum values.
1959///
1960/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
1961#[inline]
1962#[target_feature(enable = "avx2")]
1963#[cfg_attr(test, assert_instr(vpmaxsb))]
1964#[stable(feature = "simd_x86", since = "1.27.0")]
1965pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
1966    unsafe {
1967        let a = a.as_i8x32();
1968        let b = b.as_i8x32();
1969        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1970    }
1971}
1972
1973/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
1974/// the packed maximum values.
1975///
1976/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
1977#[inline]
1978#[target_feature(enable = "avx2")]
1979#[cfg_attr(test, assert_instr(vpmaxuw))]
1980#[stable(feature = "simd_x86", since = "1.27.0")]
1981pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
1982    unsafe {
1983        let a = a.as_u16x16();
1984        let b = b.as_u16x16();
1985        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1986    }
1987}
1988
1989/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
1990/// the packed maximum values.
1991///
1992/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
1993#[inline]
1994#[target_feature(enable = "avx2")]
1995#[cfg_attr(test, assert_instr(vpmaxud))]
1996#[stable(feature = "simd_x86", since = "1.27.0")]
1997pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
1998    unsafe {
1999        let a = a.as_u32x8();
2000        let b = b.as_u32x8();
2001        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
2002    }
2003}
2004
2005/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2006/// the packed maximum values.
2007///
2008/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
2009#[inline]
2010#[target_feature(enable = "avx2")]
2011#[cfg_attr(test, assert_instr(vpmaxub))]
2012#[stable(feature = "simd_x86", since = "1.27.0")]
2013pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2014    unsafe {
2015        let a = a.as_u8x32();
2016        let b = b.as_u8x32();
2017        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
2018    }
2019}
2020
2021/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2022/// minimum values.
2023///
2024/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
2025#[inline]
2026#[target_feature(enable = "avx2")]
2027#[cfg_attr(test, assert_instr(vpminsw))]
2028#[stable(feature = "simd_x86", since = "1.27.0")]
2029pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2030    unsafe {
2031        let a = a.as_i16x16();
2032        let b = b.as_i16x16();
2033        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
2034    }
2035}
2036
2037/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2038/// minimum values.
2039///
2040/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
2041#[inline]
2042#[target_feature(enable = "avx2")]
2043#[cfg_attr(test, assert_instr(vpminsd))]
2044#[stable(feature = "simd_x86", since = "1.27.0")]
2045pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2046    unsafe {
2047        let a = a.as_i32x8();
2048        let b = b.as_i32x8();
2049        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2050    }
2051}
2052
2053/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2054/// minimum values.
2055///
2056/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
2057#[inline]
2058#[target_feature(enable = "avx2")]
2059#[cfg_attr(test, assert_instr(vpminsb))]
2060#[stable(feature = "simd_x86", since = "1.27.0")]
2061pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2062    unsafe {
2063        let a = a.as_i8x32();
2064        let b = b.as_i8x32();
2065        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2066    }
2067}
2068
2069/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2070/// the packed minimum values.
2071///
2072/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2073#[inline]
2074#[target_feature(enable = "avx2")]
2075#[cfg_attr(test, assert_instr(vpminuw))]
2076#[stable(feature = "simd_x86", since = "1.27.0")]
2077pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2078    unsafe {
2079        let a = a.as_u16x16();
2080        let b = b.as_u16x16();
2081        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
2082    }
2083}
2084
2085/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2086/// the packed minimum values.
2087///
2088/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2089#[inline]
2090#[target_feature(enable = "avx2")]
2091#[cfg_attr(test, assert_instr(vpminud))]
2092#[stable(feature = "simd_x86", since = "1.27.0")]
2093pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2094    unsafe {
2095        let a = a.as_u32x8();
2096        let b = b.as_u32x8();
2097        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2098    }
2099}
2100
2101/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2102/// the packed minimum values.
2103///
2104/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2105#[inline]
2106#[target_feature(enable = "avx2")]
2107#[cfg_attr(test, assert_instr(vpminub))]
2108#[stable(feature = "simd_x86", since = "1.27.0")]
2109pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2110    unsafe {
2111        let a = a.as_u8x32();
2112        let b = b.as_u8x32();
2113        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2114    }
2115}
2116
2117/// Creates a mask from the most significant bit of each 8-bit element in `a`,
2118/// and returns the result.
2119///
2120/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
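///
/// A minimal usage sketch, assuming AVX2 support has been detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // One result bit per byte, taken from each byte's most significant bit.
/// assert_eq!(_mm256_movemask_epi8(_mm256_set1_epi8(-1)), -1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_setzero_si256()), 0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```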
2121#[inline]
2122#[target_feature(enable = "avx2")]
2123#[cfg_attr(test, assert_instr(vpmovmskb))]
2124#[stable(feature = "simd_x86", since = "1.27.0")]
2125pub fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2126    unsafe {
2127        let z = i8x32::ZERO;
2128        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2129        simd_bitmask::<_, u32>(m) as i32
2130    }
2131}
2132
2133/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2134/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2135/// results in dst. Eight SADs are performed for each 128-bit lane using one
2136/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
2137/// selected from `b` starting at the offset specified in `imm8`. Eight
2138/// quadruplets are formed from sequential 8-bit integers selected from `a`
2139/// starting at the offset specified in `imm8`.
2140///
2141/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
2142#[inline]
2143#[target_feature(enable = "avx2")]
2144#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
2145#[rustc_legacy_const_generics(2)]
2146#[stable(feature = "simd_x86", since = "1.27.0")]
2147pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2148    static_assert_uimm_bits!(IMM8, 8);
2149    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8)) }
2150}
2151
2152/// Multiplies the low 32-bit integers from each packed 64-bit element in
2153/// `a` and `b`
2154///
2155/// Returns the 64-bit results.
2156///
2157/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
2158#[inline]
2159#[target_feature(enable = "avx2")]
2160#[cfg_attr(test, assert_instr(vpmuldq))]
2161#[stable(feature = "simd_x86", since = "1.27.0")]
2162pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2163    unsafe {
2164        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
2165        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
2166        transmute(simd_mul(a, b))
2167    }
2168}
2169
2170/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2171/// element in `a` and `b`
2172///
2173/// Returns the unsigned 64-bit results.
2174///
2175/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
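///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // Only the low 32 bits of each 64-bit lane take part in the multiply.
/// let a = _mm256_set1_epi64x(0x0000_0002_0000_0003); // low half: 3
/// let b = _mm256_set1_epi64x(0x0000_0005_0000_0004); // low half: 4
/// let r = _mm256_mul_epu32(a, b);
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi64(r, _mm256_set1_epi64x(12))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```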
2176#[inline]
2177#[target_feature(enable = "avx2")]
2178#[cfg_attr(test, assert_instr(vpmuludq))]
2179#[stable(feature = "simd_x86", since = "1.27.0")]
2180pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2181    unsafe {
2182        let a = a.as_u64x4();
2183        let b = b.as_u64x4();
2184        let mask = u64x4::splat(u32::MAX.into());
2185        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
2186    }
2187}
2188
2189/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2190/// intermediate 32-bit integers and returning the high 16 bits of the
2191/// intermediate integers.
2192///
2193/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
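///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(1000);
/// let b = _mm256_set1_epi16(-2000);
/// // 1000 * -2000 = -2_000_000, and -2_000_000 >> 16 = -31.
/// let r = _mm256_mulhi_epi16(a, b);
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(-31))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```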
2194#[inline]
2195#[target_feature(enable = "avx2")]
2196#[cfg_attr(test, assert_instr(vpmulhw))]
2197#[stable(feature = "simd_x86", since = "1.27.0")]
2198pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2199    unsafe {
2200        let a = simd_cast::<_, i32x16>(a.as_i16x16());
2201        let b = simd_cast::<_, i32x16>(b.as_i16x16());
2202        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
2203        transmute(simd_cast::<i32x16, i16x16>(r))
2204    }
2205}
2206
2207/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2208/// intermediate 32-bit integers and returning the high 16 bits of the
2209/// intermediate integers.
2210///
2211/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
2212#[inline]
2213#[target_feature(enable = "avx2")]
2214#[cfg_attr(test, assert_instr(vpmulhuw))]
2215#[stable(feature = "simd_x86", since = "1.27.0")]
2216pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2217    unsafe {
2218        let a = simd_cast::<_, u32x16>(a.as_u16x16());
2219        let b = simd_cast::<_, u32x16>(b.as_u16x16());
2220        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
2221        transmute(simd_cast::<u32x16, u16x16>(r))
2222    }
2223}
2224
2225/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2226/// intermediate 32-bit integers, and returns the low 16 bits of the
2227/// intermediate integers
2228///
2229/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
2230#[inline]
2231#[target_feature(enable = "avx2")]
2232#[cfg_attr(test, assert_instr(vpmullw))]
2233#[stable(feature = "simd_x86", since = "1.27.0")]
2234pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2235    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
2236}
2237
2238/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2239/// intermediate 64-bit integers, and returns the low 32 bits of the
2240/// intermediate integers
2241///
2242/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
2243#[inline]
2244#[target_feature(enable = "avx2")]
2245#[cfg_attr(test, assert_instr(vpmulld))]
2246#[stable(feature = "simd_x86", since = "1.27.0")]
2247pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2248    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
2249}
2250
2251/// Multiplies packed 16-bit integers in `a` and `b`, producing
2252/// intermediate signed 32-bit integers. Truncates each intermediate
2253/// integer to the 18 most significant bits, rounds by adding 1, and
2254/// returns bits `[16:1]`.
2255///
2256/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
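///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // Per 16-bit lane this computes (((a * b) >> 14) + 1) >> 1, i.e. a
/// // rounded Q15 fixed-point multiply.
/// let a = _mm256_set1_epi16(16384); // 0.5 in Q15
/// let b = _mm256_set1_epi16(8192); // 0.25 in Q15
/// let r = _mm256_mulhrs_epi16(a, b); // 0.125 in Q15 == 4096
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(4096))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```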
2257#[inline]
2258#[target_feature(enable = "avx2")]
2259#[cfg_attr(test, assert_instr(vpmulhrsw))]
2260#[stable(feature = "simd_x86", since = "1.27.0")]
2261pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2262    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
2263}
2264
2265/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2266/// and `b`
2267///
2268/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
2269#[inline]
2270#[target_feature(enable = "avx2")]
2271#[cfg_attr(test, assert_instr(vorps))]
2272#[stable(feature = "simd_x86", since = "1.27.0")]
2273pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2274    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
2275}
2276
2277/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2278/// using signed saturation
2279///
2280/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
2281#[inline]
2282#[target_feature(enable = "avx2")]
2283#[cfg_attr(test, assert_instr(vpacksswb))]
2284#[stable(feature = "simd_x86", since = "1.27.0")]
2285pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2286    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
2287}
2288
2289/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2290/// using signed saturation
2291///
2292/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
2293#[inline]
2294#[target_feature(enable = "avx2")]
2295#[cfg_attr(test, assert_instr(vpackssdw))]
2296#[stable(feature = "simd_x86", since = "1.27.0")]
2297pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2298    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
2299}
2300
2301/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2302/// using unsigned saturation
2303///
2304/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
2305#[inline]
2306#[target_feature(enable = "avx2")]
2307#[cfg_attr(test, assert_instr(vpackuswb))]
2308#[stable(feature = "simd_x86", since = "1.27.0")]
2309pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2310    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
2311}
2312
2313/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2314/// using unsigned saturation
2315///
2316/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
2317#[inline]
2318#[target_feature(enable = "avx2")]
2319#[cfg_attr(test, assert_instr(vpackusdw))]
2320#[stable(feature = "simd_x86", since = "1.27.0")]
2321pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2322    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
2323}
2324
2325/// Permutes packed 32-bit integers from `a` according to the content of `b`.
2326///
2327/// The lowest 3 bits of each integer of `b` are used as addresses into the 8
2328/// integers of `a`.
2329///
2330/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
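///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
/// // Reverse the eight elements.
/// let idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
/// let r = _mm256_permutevar8x32_epi32(a, idx);
/// let expected = _mm256_setr_epi32(17, 16, 15, 14, 13, 12, 11, 10);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```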
2331#[inline]
2332#[target_feature(enable = "avx2")]
2333#[cfg_attr(test, assert_instr(vpermps))]
2334#[stable(feature = "simd_x86", since = "1.27.0")]
2335pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2336    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
2337}
2338
2339/// Permutes 64-bit integers from `a` using control mask `imm8`.
2340///
2341/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
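///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// // Each 2-bit field of the mask selects a source lane, low field first;
/// // 0b00_01_10_11 picks lanes 3, 2, 1, 0.
/// let r = _mm256_permute4x64_epi64::<0b00_01_10_11>(a);
/// let expected = _mm256_setr_epi64x(3, 2, 1, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```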
2342#[inline]
2343#[target_feature(enable = "avx2")]
2344#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
2345#[rustc_legacy_const_generics(1)]
2346#[stable(feature = "simd_x86", since = "1.27.0")]
2347pub fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2348    static_assert_uimm_bits!(IMM8, 8);
2349    unsafe {
2350        let zero = i64x4::ZERO;
2351        let r: i64x4 = simd_shuffle!(
2352            a.as_i64x4(),
2353            zero,
2354            [
2355                IMM8 as u32 & 0b11,
2356                (IMM8 as u32 >> 2) & 0b11,
2357                (IMM8 as u32 >> 4) & 0b11,
2358                (IMM8 as u32 >> 6) & 0b11,
2359            ],
2360        );
2361        transmute(r)
2362    }
2363}
2364
2365/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
2366///
2367/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
2368#[inline]
2369#[target_feature(enable = "avx2")]
2370#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
2371#[rustc_legacy_const_generics(2)]
2372#[stable(feature = "simd_x86", since = "1.27.0")]
2373pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2374    static_assert_uimm_bits!(IMM8, 8);
2375    unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) }
2376}
2377
2378/// Shuffles 64-bit floating-point elements in `a` across lanes using the
2379/// control in `imm8`.
2380///
2381/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
2382#[inline]
2383#[target_feature(enable = "avx2")]
2384#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
2385#[rustc_legacy_const_generics(1)]
2386#[stable(feature = "simd_x86", since = "1.27.0")]
2387pub fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
2388    static_assert_uimm_bits!(IMM8, 8);
2389    unsafe {
2390        simd_shuffle!(
2391            a,
2392            _mm256_undefined_pd(),
2393            [
2394                IMM8 as u32 & 0b11,
2395                (IMM8 as u32 >> 2) & 0b11,
2396                (IMM8 as u32 >> 4) & 0b11,
2397                (IMM8 as u32 >> 6) & 0b11,
2398            ],
2399        )
2400    }
2401}
2402
2403/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2404/// the corresponding 32-bit integer index in `idx`.
2405///
2406/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
2407#[inline]
2408#[target_feature(enable = "avx2")]
2409#[cfg_attr(test, assert_instr(vpermps))]
2410#[stable(feature = "simd_x86", since = "1.27.0")]
2411pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2412    unsafe { permps(a, idx.as_i32x8()) }
2413}
2414
2415/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2416/// and `b`, then horizontally sums each consecutive 8 differences to
2417/// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
2418/// integers in the low 16 bits of the four 64-bit elements of the return value.
2419///
2420/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
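///
/// A minimal usage sketch with illustrative values, assuming AVX2 support has
/// been detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(5);
/// let b = _mm256_set1_epi8(2);
/// // Each group of eight bytes contributes 8 * |5 - 2| = 24.
/// let r = _mm256_sad_epu8(a, b);
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi64(r, _mm256_set1_epi64x(24))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```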
2421#[inline]
2422#[target_feature(enable = "avx2")]
2423#[cfg_attr(test, assert_instr(vpsadbw))]
2424#[stable(feature = "simd_x86", since = "1.27.0")]
2425pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2426    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
2427}
2428
2429/// Shuffles bytes from `a` according to the content of `b`.
2430///
2431/// For each of the 128-bit low and high halves of the vectors, the low
2432/// 4 bits of each byte of `b` are used as addresses into the respective
2433/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
2434///
2435/// In addition, if the most significant bit of a byte of `b` is set, the
2436/// respective destination byte is set to 0.
2437///
2438/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2439/// equivalent to:
2440///
2441/// ```
2442/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2443///     let mut r = [0; 32];
2444///     for i in 0..16 {
2445///         // if the most significant bit of b is set,
2446///         // then the destination byte is set to 0.
2447///         if b[i] & 0x80 == 0u8 {
2448///             r[i] = a[(b[i] % 16) as usize];
2449///         }
2450///         if b[i + 16] & 0x80 == 0u8 {
2451///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2452///         }
2453///     }
2454///     r
2455/// }
2456/// ```
2457///
2458/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
2459#[inline]
2460#[target_feature(enable = "avx2")]
2461#[cfg_attr(test, assert_instr(vpshufb))]
2462#[stable(feature = "simd_x86", since = "1.27.0")]
2463pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2464    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
2465}
2466
2467/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2468/// `imm8`.
2469///
2470/// ```rust
2471/// #[cfg(target_arch = "x86")]
2472/// use std::arch::x86::*;
2473/// #[cfg(target_arch = "x86_64")]
2474/// use std::arch::x86_64::*;
2475///
2476/// # fn main() {
2477/// #     if is_x86_feature_detected!("avx2") {
2478/// #         #[target_feature(enable = "avx2")]
2479/// #         unsafe fn worker() {
2480/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2481///
2482/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2483/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2484///
2485/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2486/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2487///
2488/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2489/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2490/// #         }
2491/// #         unsafe { worker(); }
2492/// #     }
2493/// # }
2494/// ```
2495///
2496/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
2497#[inline]
2498#[target_feature(enable = "avx2")]
2499#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
2500#[rustc_legacy_const_generics(1)]
2501#[stable(feature = "simd_x86", since = "1.27.0")]
2502pub fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
2503    static_assert_uimm_bits!(MASK, 8);
2504    unsafe {
2505        let r: i32x8 = simd_shuffle!(
2506            a.as_i32x8(),
2507            a.as_i32x8(),
2508            [
2509                MASK as u32 & 0b11,
2510                (MASK as u32 >> 2) & 0b11,
2511                (MASK as u32 >> 4) & 0b11,
2512                (MASK as u32 >> 6) & 0b11,
2513                (MASK as u32 & 0b11) + 4,
2514                ((MASK as u32 >> 2) & 0b11) + 4,
2515                ((MASK as u32 >> 4) & 0b11) + 4,
2516                ((MASK as u32 >> 6) & 0b11) + 4,
2517            ],
2518        );
2519        transmute(r)
2520    }
2521}
2522
2523/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2524/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2525/// to the output.
2526///
2527/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
2528#[inline]
2529#[target_feature(enable = "avx2")]
2530#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
2531#[rustc_legacy_const_generics(1)]
2532#[stable(feature = "simd_x86", since = "1.27.0")]
2533pub fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2534    static_assert_uimm_bits!(IMM8, 8);
2535    unsafe {
2536        let a = a.as_i16x16();
2537        let r: i16x16 = simd_shuffle!(
2538            a,
2539            a,
2540            [
2541                0,
2542                1,
2543                2,
2544                3,
2545                4 + (IMM8 as u32 & 0b11),
2546                4 + ((IMM8 as u32 >> 2) & 0b11),
2547                4 + ((IMM8 as u32 >> 4) & 0b11),
2548                4 + ((IMM8 as u32 >> 6) & 0b11),
2549                8,
2550                9,
2551                10,
2552                11,
2553                12 + (IMM8 as u32 & 0b11),
2554                12 + ((IMM8 as u32 >> 2) & 0b11),
2555                12 + ((IMM8 as u32 >> 4) & 0b11),
2556                12 + ((IMM8 as u32 >> 6) & 0b11),
2557            ],
2558        );
2559        transmute(r)
2560    }
2561}
2562
2563/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2564/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2565/// to the output.
2566///
2567/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
2568#[inline]
2569#[target_feature(enable = "avx2")]
2570#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
2571#[rustc_legacy_const_generics(1)]
2572#[stable(feature = "simd_x86", since = "1.27.0")]
2573pub fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2574    static_assert_uimm_bits!(IMM8, 8);
2575    unsafe {
2576        let a = a.as_i16x16();
2577        let r: i16x16 = simd_shuffle!(
2578            a,
2579            a,
2580            [
2581                0 + (IMM8 as u32 & 0b11),
2582                0 + ((IMM8 as u32 >> 2) & 0b11),
2583                0 + ((IMM8 as u32 >> 4) & 0b11),
2584                0 + ((IMM8 as u32 >> 6) & 0b11),
2585                4,
2586                5,
2587                6,
2588                7,
2589                8 + (IMM8 as u32 & 0b11),
2590                8 + ((IMM8 as u32 >> 2) & 0b11),
2591                8 + ((IMM8 as u32 >> 4) & 0b11),
2592                8 + ((IMM8 as u32 >> 6) & 0b11),
2593                12,
2594                13,
2595                14,
2596                15,
2597            ],
2598        );
2599        transmute(r)
2600    }
2601}
2602
2603/// Negates packed 16-bit integers in `a` when the corresponding signed
2604/// 16-bit integer in `b` is negative, and returns the results.
2605/// Results are zeroed out when the corresponding element in `b` is zero.
2606///
2607/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
2608#[inline]
2609#[target_feature(enable = "avx2")]
2610#[cfg_attr(test, assert_instr(vpsignw))]
2611#[stable(feature = "simd_x86", since = "1.27.0")]
2612pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2613    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
2614}
2615
2616/// Negates packed 32-bit integers in `a` when the corresponding signed
2617/// 32-bit integer in `b` is negative, and returns the results.
2618/// Results are zeroed out when the corresponding element in `b` is zero.
2619///
2620/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
2621#[inline]
2622#[target_feature(enable = "avx2")]
2623#[cfg_attr(test, assert_instr(vpsignd))]
2624#[stable(feature = "simd_x86", since = "1.27.0")]
2625pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2626    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
2627}
2628
2629/// Negates packed 8-bit integers in `a` when the corresponding signed
2630/// 8-bit integer in `b` is negative, and returns the results.
2631/// Results are zeroed out when the corresponding element in `b` is zero.
2632///
2633/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
2634#[inline]
2635#[target_feature(enable = "avx2")]
2636#[cfg_attr(test, assert_instr(vpsignb))]
2637#[stable(feature = "simd_x86", since = "1.27.0")]
2638pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2639    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
2640}
2641
2642/// Shifts packed 16-bit integers in `a` left by `count` while
2643/// shifting in zeros, and returns the result
2644///
2645/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
2646#[inline]
2647#[target_feature(enable = "avx2")]
2648#[cfg_attr(test, assert_instr(vpsllw))]
2649#[stable(feature = "simd_x86", since = "1.27.0")]
2650pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2651    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
2652}
2653
2654/// Shifts packed 32-bit integers in `a` left by `count` while
2655/// shifting in zeros, and returns the result
2656///
2657/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
2658#[inline]
2659#[target_feature(enable = "avx2")]
2660#[cfg_attr(test, assert_instr(vpslld))]
2661#[stable(feature = "simd_x86", since = "1.27.0")]
2662pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2663    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
2664}
2665
2666/// Shifts packed 64-bit integers in `a` left by `count` while
2667/// shifting in zeros, and returns the result
2668///
2669/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
2670#[inline]
2671#[target_feature(enable = "avx2")]
2672#[cfg_attr(test, assert_instr(vpsllq))]
2673#[stable(feature = "simd_x86", since = "1.27.0")]
2674pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2675    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
2676}
2677
2678/// Shifts packed 16-bit integers in `a` left by `IMM8` while
2679/// shifting in zeros, and returns the results.
2680///
2681/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
2682#[inline]
2683#[target_feature(enable = "avx2")]
2684#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
2685#[rustc_legacy_const_generics(1)]
2686#[stable(feature = "simd_x86", since = "1.27.0")]
2687pub fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2688    static_assert_uimm_bits!(IMM8, 8);
2689    unsafe {
2690        if IMM8 >= 16 {
2691            _mm256_setzero_si256()
2692        } else {
2693            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
2694        }
2695    }
2696}
2697
2698/// Shifts packed 32-bit integers in `a` left by `IMM8` while
2699/// shifting in zeros, and returns the results.
2700///
2701/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2702#[inline]
2703#[target_feature(enable = "avx2")]
2704#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2705#[rustc_legacy_const_generics(1)]
2706#[stable(feature = "simd_x86", since = "1.27.0")]
2707pub fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2708    unsafe {
2709        static_assert_uimm_bits!(IMM8, 8);
2710        if IMM8 >= 32 {
2711            _mm256_setzero_si256()
2712        } else {
2713            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2714        }
2715    }
2716}
2717
2718/// Shifts packed 64-bit integers in `a` left by `IMM8` while
2719/// shifting in zeros, and returns the results.
2720///
2721/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2722#[inline]
2723#[target_feature(enable = "avx2")]
2724#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2725#[rustc_legacy_const_generics(1)]
2726#[stable(feature = "simd_x86", since = "1.27.0")]
2727pub fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2728    unsafe {
2729        static_assert_uimm_bits!(IMM8, 8);
2730        if IMM8 >= 64 {
2731            _mm256_setzero_si256()
2732        } else {
2733            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2734        }
2735    }
2736}
2737
2738/// Shifts 128-bit lanes in `a` left by `IMM8` bytes while shifting in zeros.
2739///
2740/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
2741#[inline]
2742#[target_feature(enable = "avx2")]
2743#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2744#[rustc_legacy_const_generics(1)]
2745#[stable(feature = "simd_x86", since = "1.27.0")]
2746pub fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2747    static_assert_uimm_bits!(IMM8, 8);
2748    _mm256_bslli_epi128::<IMM8>(a)
2749}
2750
2751/// Shifts 128-bit lanes in `a` left by `IMM8` bytes while shifting in zeros.
2752///
2753/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
2754#[inline]
2755#[target_feature(enable = "avx2")]
2756#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2757#[rustc_legacy_const_generics(1)]
2758#[stable(feature = "simd_x86", since = "1.27.0")]
2759pub fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2760    static_assert_uimm_bits!(IMM8, 8);
2761    const fn mask(shift: i32, i: u32) -> u32 {
2762        let shift = shift as u32 & 0xff;
2763        if shift > 15 || i % 16 < shift {
2764            0
2765        } else {
2766            32 + (i - shift)
2767        }
2768    }
2769    unsafe {
2770        let a = a.as_i8x32();
2771        let r: i8x32 = simd_shuffle!(
2772            i8x32::ZERO,
2773            a,
2774            [
2775                mask(IMM8, 0),
2776                mask(IMM8, 1),
2777                mask(IMM8, 2),
2778                mask(IMM8, 3),
2779                mask(IMM8, 4),
2780                mask(IMM8, 5),
2781                mask(IMM8, 6),
2782                mask(IMM8, 7),
2783                mask(IMM8, 8),
2784                mask(IMM8, 9),
2785                mask(IMM8, 10),
2786                mask(IMM8, 11),
2787                mask(IMM8, 12),
2788                mask(IMM8, 13),
2789                mask(IMM8, 14),
2790                mask(IMM8, 15),
2791                mask(IMM8, 16),
2792                mask(IMM8, 17),
2793                mask(IMM8, 18),
2794                mask(IMM8, 19),
2795                mask(IMM8, 20),
2796                mask(IMM8, 21),
2797                mask(IMM8, 22),
2798                mask(IMM8, 23),
2799                mask(IMM8, 24),
2800                mask(IMM8, 25),
2801                mask(IMM8, 26),
2802                mask(IMM8, 27),
2803                mask(IMM8, 28),
2804                mask(IMM8, 29),
2805                mask(IMM8, 30),
2806                mask(IMM8, 31),
2807            ],
2808        );
2809        transmute(r)
2810    }
2811}
2812
2813/// Shifts packed 32-bit integers in `a` left by the amount
2814/// specified by the corresponding element in `count` while
2815/// shifting in zeros, and returns the result.
2816///
2817/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
2818#[inline]
2819#[target_feature(enable = "avx2")]
2820#[cfg_attr(test, assert_instr(vpsllvd))]
2821#[stable(feature = "simd_x86", since = "1.27.0")]
2822pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
2823    unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) }
2824}
2825
2826/// Shifts packed 32-bit integers in `a` left by the amount
2827/// specified by the corresponding element in `count` while
2828/// shifting in zeros, and returns the result.
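///
/// An illustrative per-lane sketch:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let counts = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let r = _mm256_sllv_epi32(a, counts);
/// // each lane is shifted by its own count: 1 << 0, 1 << 1, ..., 1 << 7
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```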
2829///
2830/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2831#[inline]
2832#[target_feature(enable = "avx2")]
2833#[cfg_attr(test, assert_instr(vpsllvd))]
2834#[stable(feature = "simd_x86", since = "1.27.0")]
2835pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2836    unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) }
2837}
2838
2839/// Shifts packed 64-bit integers in `a` left by the amount
2840/// specified by the corresponding element in `count` while
2841/// shifting in zeros, and returns the result.
2842///
2843/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2844#[inline]
2845#[target_feature(enable = "avx2")]
2846#[cfg_attr(test, assert_instr(vpsllvq))]
2847#[stable(feature = "simd_x86", since = "1.27.0")]
2848pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
2849    unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) }
2850}
2851
2852/// Shifts packed 64-bit integers in `a` left by the amount
2853/// specified by the corresponding element in `count` while
2854/// shifting in zeros, and returns the result.
2855///
2856/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
2857#[inline]
2858#[target_feature(enable = "avx2")]
2859#[cfg_attr(test, assert_instr(vpsllvq))]
2860#[stable(feature = "simd_x86", since = "1.27.0")]
2861pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
2862    unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) }
2863}
2864
2865/// Shifts packed 16-bit integers in `a` right by `count` while
2866/// shifting in sign bits.
2867///
2868/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
2869#[inline]
2870#[target_feature(enable = "avx2")]
2871#[cfg_attr(test, assert_instr(vpsraw))]
2872#[stable(feature = "simd_x86", since = "1.27.0")]
2873pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
2874    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
2875}
2876
2877/// Shifts packed 32-bit integers in `a` right by `count` while
2878/// shifting in sign bits.
2879///
2880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
2881#[inline]
2882#[target_feature(enable = "avx2")]
2883#[cfg_attr(test, assert_instr(vpsrad))]
2884#[stable(feature = "simd_x86", since = "1.27.0")]
2885pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
2886    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
2887}
2888
2889/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2890/// shifting in sign bits.
2891///
2892/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
2893#[inline]
2894#[target_feature(enable = "avx2")]
2895#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
2896#[rustc_legacy_const_generics(1)]
2897#[stable(feature = "simd_x86", since = "1.27.0")]
2898pub fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2899    static_assert_uimm_bits!(IMM8, 8);
2900    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
2901}
2902
2903/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2904/// shifting in sign bits.
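///
/// For example (an illustrative sketch):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(-8);
/// let r = _mm256_srai_epi32::<2>(a);
/// // the sign bit is replicated, so -8 >> 2 == -2 in every lane
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi32(r, _mm256_set1_epi32(-2))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```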
2905///
2906/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
2907#[inline]
2908#[target_feature(enable = "avx2")]
2909#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
2910#[rustc_legacy_const_generics(1)]
2911#[stable(feature = "simd_x86", since = "1.27.0")]
2912pub fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2913    static_assert_uimm_bits!(IMM8, 8);
2914    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
2915}
2916
2917/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2918/// corresponding element in `count` while shifting in sign bits.
2919///
2920/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
2921#[inline]
2922#[target_feature(enable = "avx2")]
2923#[cfg_attr(test, assert_instr(vpsravd))]
2924#[stable(feature = "simd_x86", since = "1.27.0")]
2925pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
2926    unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) }
2927}
2928
2929/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2930/// corresponding element in `count` while shifting in sign bits.
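///
/// A per-lane sketch (illustrative only):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(-16, -16, -16, -16, 16, 16, 16, 16);
/// let counts = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
/// let r = _mm256_srav_epi32(a, counts);
/// // negative lanes keep their sign as they are shifted
/// let expected = _mm256_setr_epi32(-16, -8, -4, -2, 16, 8, 4, 2);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```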
2931///
2932/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
2933#[inline]
2934#[target_feature(enable = "avx2")]
2935#[cfg_attr(test, assert_instr(vpsravd))]
2936#[stable(feature = "simd_x86", since = "1.27.0")]
2937pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
2938    unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) }
2939}
2940
2941/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
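///
/// An illustrative sketch; note that each 128-bit lane is shifted independently:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// let r = _mm256_srli_si256::<1>(a);
/// // within each lane the bytes move down by one and a zero enters at the top
/// let expected = _mm256_setr_epi8(
///     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,
///     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```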
2942///
2943/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
2944#[inline]
2945#[target_feature(enable = "avx2")]
2946#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2947#[rustc_legacy_const_generics(1)]
2948#[stable(feature = "simd_x86", since = "1.27.0")]
2949pub fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2950    static_assert_uimm_bits!(IMM8, 8);
2951    _mm256_bsrli_epi128::<IMM8>(a)
2952}
2953
2954/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
2955///
2956/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
2957#[inline]
2958#[target_feature(enable = "avx2")]
2959#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2960#[rustc_legacy_const_generics(1)]
2961#[stable(feature = "simd_x86", since = "1.27.0")]
2962pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2963    static_assert_uimm_bits!(IMM8, 8);
2964    unsafe {
2965        let a = a.as_i8x32();
2966        let zero = i8x32::ZERO;
2967        let r: i8x32 = match IMM8 {
2968            0 => simd_shuffle!(
2969                a,
2970                zero,
2971                [
2972                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
2973                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2974                ],
2975            ),
2976            1 => simd_shuffle!(
2977                a,
2978                zero,
2979                [
2980                    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22,
2981                    23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
2982                ],
2983            ),
2984            2 => simd_shuffle!(
2985                a,
2986                zero,
2987                [
2988                    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23,
2989                    24, 25, 26, 27, 28, 29, 30, 31, 32, 32,
2990                ],
2991            ),
2992            3 => simd_shuffle!(
2993                a,
2994                zero,
2995                [
2996                    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23,
2997                    24, 25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
2998                ],
2999            ),
3000            4 => simd_shuffle!(
3001                a,
3002                zero,
3003                [
3004                    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24,
3005                    25, 26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
3006                ],
3007            ),
3008            5 => simd_shuffle!(
3009                a,
3010                zero,
3011                [
3012                    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25,
3013                    26, 27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
3014                ],
3015            ),
3016            6 => simd_shuffle!(
3017                a,
3018                zero,
3019                [
3020                    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26,
3021                    27, 28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
3022                ],
3023            ),
3024            7 => simd_shuffle!(
3025                a,
3026                zero,
3027                [
3028                    7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26,
3029                    27, 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
3030                ],
3031            ),
3032            8 => simd_shuffle!(
3033                a,
3034                zero,
3035                [
3036                    8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27,
3037                    28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
3038                ],
3039            ),
3040            9 => simd_shuffle!(
3041                a,
3042                zero,
3043                [
3044                    9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28,
3045                    29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3046                ],
3047            ),
3048            10 => simd_shuffle!(
3049                a,
3050                zero,
3051                [
3052                    10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29,
3053                    30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3054                ],
3055            ),
3056            11 => simd_shuffle!(
3057                a,
3058                zero,
3059                [
3060                    11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30,
3061                    31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3062                ],
3063            ),
3064            12 => simd_shuffle!(
3065                a,
3066                zero,
3067                [
3068                    12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31,
3069                    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3070                ],
3071            ),
3072            13 => simd_shuffle!(
3073                a,
3074                zero,
3075                [
3076                    13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32,
3077                    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3078                ],
3079            ),
3080            14 => simd_shuffle!(
3081                a,
3082                zero,
3083                [
3084                    14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32,
3085                    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3086                ],
3087            ),
3088            15 => simd_shuffle!(
3089                a,
3090                zero,
3091                [
3092                    15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
3093                    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3094                ],
3095            ),
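            // `IMM8` is in 0..=255 after the assert above, so a shift of 16 or more bytes
            // falls through here and clears the whole vector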
3096            _ => zero,
3097        };
3098        transmute(r)
3099    }
3100}
3101
3102/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3103/// zeros.
3104///
3105/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
3106#[inline]
3107#[target_feature(enable = "avx2")]
3108#[cfg_attr(test, assert_instr(vpsrlw))]
3109#[stable(feature = "simd_x86", since = "1.27.0")]
3110pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3111    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
3112}
3113
3114/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3115/// zeros.
3116///
3117/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
3118#[inline]
3119#[target_feature(enable = "avx2")]
3120#[cfg_attr(test, assert_instr(vpsrld))]
3121#[stable(feature = "simd_x86", since = "1.27.0")]
3122pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3123    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
3124}
3125
3126/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3127/// zeros.
3128///
3129/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3130#[inline]
3131#[target_feature(enable = "avx2")]
3132#[cfg_attr(test, assert_instr(vpsrlq))]
3133#[stable(feature = "simd_x86", since = "1.27.0")]
3134pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3135    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
3136}
3137
3138/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
3139/// zeros.
3140///
3141/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3142#[inline]
3143#[target_feature(enable = "avx2")]
3144#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3145#[rustc_legacy_const_generics(1)]
3146#[stable(feature = "simd_x86", since = "1.27.0")]
3147pub fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3148    static_assert_uimm_bits!(IMM8, 8);
3149    unsafe {
3150        if IMM8 >= 16 {
3151            _mm256_setzero_si256()
3152        } else {
3153            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3154        }
3155    }
3156}
3157
3158/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
3159/// zeros.
3160///
3161/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3162#[inline]
3163#[target_feature(enable = "avx2")]
3164#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3165#[rustc_legacy_const_generics(1)]
3166#[stable(feature = "simd_x86", since = "1.27.0")]
3167pub fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3168    static_assert_uimm_bits!(IMM8, 8);
3169    unsafe {
3170        if IMM8 >= 32 {
3171            _mm256_setzero_si256()
3172        } else {
3173            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3174        }
3175    }
3176}
3177
3178/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3179/// zeros.
3180///
3181/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3182#[inline]
3183#[target_feature(enable = "avx2")]
3184#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3185#[rustc_legacy_const_generics(1)]
3186#[stable(feature = "simd_x86", since = "1.27.0")]
3187pub fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3188    static_assert_uimm_bits!(IMM8, 8);
3189    unsafe {
3190        if IMM8 >= 64 {
3191            _mm256_setzero_si256()
3192        } else {
3193            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3194        }
3195    }
3196}
3197
3198/// Shifts packed 32-bit integers in `a` right by the amount specified by
3199/// the corresponding element in `count` while shifting in zeros.
3200///
3201/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3202#[inline]
3203#[target_feature(enable = "avx2")]
3204#[cfg_attr(test, assert_instr(vpsrlvd))]
3205#[stable(feature = "simd_x86", since = "1.27.0")]
3206pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3207    unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) }
3208}
3209
3210/// Shifts packed 32-bit integers in `a` right by the amount specified by
3211/// the corresponding element in `count` while shifting in zeros.
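///
/// An illustrative per-lane example:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(256);
/// let counts = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let r = _mm256_srlv_epi32(a, counts);
/// // each lane is shifted right by its own count
/// let expected = _mm256_setr_epi32(256, 128, 64, 32, 16, 8, 4, 2);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```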
3212///
3213/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3214#[inline]
3215#[target_feature(enable = "avx2")]
3216#[cfg_attr(test, assert_instr(vpsrlvd))]
3217#[stable(feature = "simd_x86", since = "1.27.0")]
3218pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3219    unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) }
3220}
3221
3222/// Shifts packed 64-bit integers in `a` right by the amount specified by
3223/// the corresponding element in `count` while shifting in zeros.
3224///
3225/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3226#[inline]
3227#[target_feature(enable = "avx2")]
3228#[cfg_attr(test, assert_instr(vpsrlvq))]
3229#[stable(feature = "simd_x86", since = "1.27.0")]
3230pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3231    unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) }
3232}
3233
3234/// Shifts packed 64-bit integers in `a` right by the amount specified by
3235/// the corresponding element in `count` while shifting in zeros.
3236///
3237/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3238#[inline]
3239#[target_feature(enable = "avx2")]
3240#[cfg_attr(test, assert_instr(vpsrlvq))]
3241#[stable(feature = "simd_x86", since = "1.27.0")]
3242pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3243    unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) }
3244}
3245
3246/// Loads 256 bits of integer data from memory into the result using a non-temporal memory hint.
3247/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
3248/// generated. To minimize caching, the data is flagged as non-temporal (unlikely to be used again soon).
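///
/// A minimal sketch (illustrative only); it relies on the fact that `__m256i` values are
/// themselves 32-byte aligned, so the address of another vector is a valid source:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let src = _mm256_set1_epi32(7);
/// // SAFETY: `&src` is 32-byte aligned because `__m256i` has 32-byte alignment
/// let loaded = unsafe { _mm256_stream_load_si256(&src as *const __m256i) };
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(loaded, src)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```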
3249///
3250/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
3251#[inline]
3252#[target_feature(enable = "avx2")]
3253#[cfg_attr(test, assert_instr(vmovntdqa))]
3254#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3255pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
3256    let dst: __m256i;
3257    crate::arch::asm!(
3258        vpl!("vmovntdqa {a}"),
3259        a = out(ymm_reg) dst,
3260        p = in(reg) mem_addr,
3261        options(pure, readonly, nostack, preserves_flags),
3262    );
3263    dst
3264}
3265
3266/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3267///
3268/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3269#[inline]
3270#[target_feature(enable = "avx2")]
3271#[cfg_attr(test, assert_instr(vpsubw))]
3272#[stable(feature = "simd_x86", since = "1.27.0")]
3273pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3274    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3275}
3276
3277/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3278///
3279/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3280#[inline]
3281#[target_feature(enable = "avx2")]
3282#[cfg_attr(test, assert_instr(vpsubd))]
3283#[stable(feature = "simd_x86", since = "1.27.0")]
3284pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3285    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3286}
3287
3288/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3289///
3290/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3291#[inline]
3292#[target_feature(enable = "avx2")]
3293#[cfg_attr(test, assert_instr(vpsubq))]
3294#[stable(feature = "simd_x86", since = "1.27.0")]
3295pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3296    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3297}
3298
3299/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3300///
3301/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3302#[inline]
3303#[target_feature(enable = "avx2")]
3304#[cfg_attr(test, assert_instr(vpsubb))]
3305#[stable(feature = "simd_x86", since = "1.27.0")]
3306pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3307    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3308}
3309
3310/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3311/// `a` using saturation.
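///
/// A saturation sketch (illustrative only):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(i16::MIN);
/// let b = _mm256_set1_epi16(1);
/// // signed saturation clamps `i16::MIN - 1` at `i16::MIN` instead of wrapping
/// let r = _mm256_subs_epi16(a, b);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, a)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```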
3312///
3313/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3314#[inline]
3315#[target_feature(enable = "avx2")]
3316#[cfg_attr(test, assert_instr(vpsubsw))]
3317#[stable(feature = "simd_x86", since = "1.27.0")]
3318pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3319    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3320}
3321
3322/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3323/// `a` using saturation.
3324///
3325/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3326#[inline]
3327#[target_feature(enable = "avx2")]
3328#[cfg_attr(test, assert_instr(vpsubsb))]
3329#[stable(feature = "simd_x86", since = "1.27.0")]
3330pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3331    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3332}
3333
3334/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3335/// 16-bit integers in `a` using saturation.
3336///
3337/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3338#[inline]
3339#[target_feature(enable = "avx2")]
3340#[cfg_attr(test, assert_instr(vpsubusw))]
3341#[stable(feature = "simd_x86", since = "1.27.0")]
3342pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3343    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3344}
3345
3346/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3347/// 8-bit integers in `a` using saturation.
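///
/// An illustrative sketch of the unsigned clamping behavior:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(10);
/// let b = _mm256_set1_epi8(20);
/// // unsigned saturation clamps 10 - 20 at 0 instead of wrapping around
/// let r = _mm256_subs_epu8(a, b);
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_setzero_si256())),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```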
3348///
3349/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3350#[inline]
3351#[target_feature(enable = "avx2")]
3352#[cfg_attr(test, assert_instr(vpsubusb))]
3353#[stable(feature = "simd_x86", since = "1.27.0")]
3354pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3355    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3356}
3357
3358/// Unpacks and interleaves 8-bit integers from the high half of each
3359/// 128-bit lane in `a` and `b`.
3360///
3361/// ```rust
3362/// #[cfg(target_arch = "x86")]
3363/// use std::arch::x86::*;
3364/// #[cfg(target_arch = "x86_64")]
3365/// use std::arch::x86_64::*;
3366///
3367/// # fn main() {
3368/// #     if is_x86_feature_detected!("avx2") {
3369/// #         #[target_feature(enable = "avx2")]
3370/// #         unsafe fn worker() {
3371/// let a = _mm256_setr_epi8(
3372///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3373///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3374/// );
3375/// let b = _mm256_setr_epi8(
3376///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3377///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3378///     -30, -31,
3379/// );
3380///
3381/// let c = _mm256_unpackhi_epi8(a, b);
3382///
3383/// let expected = _mm256_setr_epi8(
3384///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3385///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3386///     -31,
3387/// );
3388/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3389///
3390/// #         }
3391/// #         unsafe { worker(); }
3392/// #     }
3393/// # }
3394/// ```
3395///
3396/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
3397#[inline]
3398#[target_feature(enable = "avx2")]
3399#[cfg_attr(test, assert_instr(vpunpckhbw))]
3400#[stable(feature = "simd_x86", since = "1.27.0")]
3401pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3402    unsafe {
3403        #[rustfmt::skip]
3404        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3405                8, 40, 9, 41, 10, 42, 11, 43,
3406                12, 44, 13, 45, 14, 46, 15, 47,
3407                24, 56, 25, 57, 26, 58, 27, 59,
3408                28, 60, 29, 61, 30, 62, 31, 63,
3409        ]);
3410        transmute(r)
3411    }
3412}
3413
3414/// Unpacks and interleaves 8-bit integers from the low half of each
3415/// 128-bit lane of `a` and `b`.
3416///
3417/// ```rust
3418/// #[cfg(target_arch = "x86")]
3419/// use std::arch::x86::*;
3420/// #[cfg(target_arch = "x86_64")]
3421/// use std::arch::x86_64::*;
3422///
3423/// # fn main() {
3424/// #     if is_x86_feature_detected!("avx2") {
3425/// #         #[target_feature(enable = "avx2")]
3426/// #         unsafe fn worker() {
3427/// let a = _mm256_setr_epi8(
3428///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3429///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3430/// );
3431/// let b = _mm256_setr_epi8(
3432///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3433///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3434///     -30, -31,
3435/// );
3436///
3437/// let c = _mm256_unpacklo_epi8(a, b);
3438///
3439/// let expected = _mm256_setr_epi8(
3440///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3441///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3442/// );
3443/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3444///
3445/// #         }
3446/// #         unsafe { worker(); }
3447/// #     }
3448/// # }
3449/// ```
3450///
3451/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
3452#[inline]
3453#[target_feature(enable = "avx2")]
3454#[cfg_attr(test, assert_instr(vpunpcklbw))]
3455#[stable(feature = "simd_x86", since = "1.27.0")]
3456pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3457    unsafe {
3458        #[rustfmt::skip]
3459        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3460            0, 32, 1, 33, 2, 34, 3, 35,
3461            4, 36, 5, 37, 6, 38, 7, 39,
3462            16, 48, 17, 49, 18, 50, 19, 51,
3463            20, 52, 21, 53, 22, 54, 23, 55,
3464        ]);
3465        transmute(r)
3466    }
3467}
3468
3469/// Unpacks and interleaves 16-bit integers from the high half of each
3470/// 128-bit lane of `a` and `b`.
3471///
3472/// ```rust
3473/// #[cfg(target_arch = "x86")]
3474/// use std::arch::x86::*;
3475/// #[cfg(target_arch = "x86_64")]
3476/// use std::arch::x86_64::*;
3477///
3478/// # fn main() {
3479/// #     if is_x86_feature_detected!("avx2") {
3480/// #         #[target_feature(enable = "avx2")]
3481/// #         unsafe fn worker() {
3482/// let a = _mm256_setr_epi16(
3483///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3484/// );
3485/// let b = _mm256_setr_epi16(
3486///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3487/// );
3488///
3489/// let c = _mm256_unpackhi_epi16(a, b);
3490///
3491/// let expected = _mm256_setr_epi16(
3492///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3493/// );
3494/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3495///
3496/// #         }
3497/// #         unsafe { worker(); }
3498/// #     }
3499/// # }
3500/// ```
3501///
3502/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
3503#[inline]
3504#[target_feature(enable = "avx2")]
3505#[cfg_attr(test, assert_instr(vpunpckhwd))]
3506#[stable(feature = "simd_x86", since = "1.27.0")]
3507pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3508    unsafe {
3509        let r: i16x16 = simd_shuffle!(
3510            a.as_i16x16(),
3511            b.as_i16x16(),
3512            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3513        );
3514        transmute(r)
3515    }
3516}
3517
3518/// Unpacks and interleaves 16-bit integers from the low half of each
3519/// 128-bit lane of `a` and `b`.
3520///
3521/// ```rust
3522/// #[cfg(target_arch = "x86")]
3523/// use std::arch::x86::*;
3524/// #[cfg(target_arch = "x86_64")]
3525/// use std::arch::x86_64::*;
3526///
3527/// # fn main() {
3528/// #     if is_x86_feature_detected!("avx2") {
3529/// #         #[target_feature(enable = "avx2")]
3530/// #         unsafe fn worker() {
3531///
3532/// let a = _mm256_setr_epi16(
3533///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3534/// );
3535/// let b = _mm256_setr_epi16(
3536///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3537/// );
3538///
3539/// let c = _mm256_unpacklo_epi16(a, b);
3540///
3541/// let expected = _mm256_setr_epi16(
3542///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3543/// );
3544/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3545///
3546/// #         }
3547/// #         unsafe { worker(); }
3548/// #     }
3549/// # }
3550/// ```
3551///
3552/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
3553#[inline]
3554#[target_feature(enable = "avx2")]
3555#[cfg_attr(test, assert_instr(vpunpcklwd))]
3556#[stable(feature = "simd_x86", since = "1.27.0")]
3557pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3558    unsafe {
3559        let r: i16x16 = simd_shuffle!(
3560            a.as_i16x16(),
3561            b.as_i16x16(),
3562            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3563        );
3564        transmute(r)
3565    }
3566}
3567
3568/// Unpacks and interleaves 32-bit integers from the high half of each
3569/// 128-bit lane of `a` and `b`.
3570///
3571/// ```rust
3572/// #[cfg(target_arch = "x86")]
3573/// use std::arch::x86::*;
3574/// #[cfg(target_arch = "x86_64")]
3575/// use std::arch::x86_64::*;
3576///
3577/// # fn main() {
3578/// #     if is_x86_feature_detected!("avx2") {
3579/// #         #[target_feature(enable = "avx2")]
3580/// #         unsafe fn worker() {
3581/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3582/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3583///
3584/// let c = _mm256_unpackhi_epi32(a, b);
3585///
3586/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3587/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3588///
3589/// #         }
3590/// #         unsafe { worker(); }
3591/// #     }
3592/// # }
3593/// ```
3594///
3595/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3596#[inline]
3597#[target_feature(enable = "avx2")]
3598#[cfg_attr(test, assert_instr(vunpckhps))]
3599#[stable(feature = "simd_x86", since = "1.27.0")]
3600pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3601    unsafe {
3602        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3603        transmute(r)
3604    }
3605}
3606
3607/// Unpacks and interleaves 32-bit integers from the low half of each
3608/// 128-bit lane of `a` and `b`.
3609///
3610/// ```rust
3611/// #[cfg(target_arch = "x86")]
3612/// use std::arch::x86::*;
3613/// #[cfg(target_arch = "x86_64")]
3614/// use std::arch::x86_64::*;
3615///
3616/// # fn main() {
3617/// #     if is_x86_feature_detected!("avx2") {
3618/// #         #[target_feature(enable = "avx2")]
3619/// #         unsafe fn worker() {
3620/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3621/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3622///
3623/// let c = _mm256_unpacklo_epi32(a, b);
3624///
3625/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3626/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3627///
3628/// #         }
3629/// #         unsafe { worker(); }
3630/// #     }
3631/// # }
3632/// ```
3633///
3634/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3635#[inline]
3636#[target_feature(enable = "avx2")]
3637#[cfg_attr(test, assert_instr(vunpcklps))]
3638#[stable(feature = "simd_x86", since = "1.27.0")]
3639pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3640    unsafe {
3641        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3642        transmute(r)
3643    }
3644}
3645
3646/// Unpacks and interleaves 64-bit integers from the high half of each
3647/// 128-bit lane of `a` and `b`.
3648///
3649/// ```rust
3650/// #[cfg(target_arch = "x86")]
3651/// use std::arch::x86::*;
3652/// #[cfg(target_arch = "x86_64")]
3653/// use std::arch::x86_64::*;
3654///
3655/// # fn main() {
3656/// #     if is_x86_feature_detected!("avx2") {
3657/// #         #[target_feature(enable = "avx2")]
3658/// #         unsafe fn worker() {
3659/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3660/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3661///
3662/// let c = _mm256_unpackhi_epi64(a, b);
3663///
3664/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3665/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3666///
3667/// #         }
3668/// #         unsafe { worker(); }
3669/// #     }
3670/// # }
3671/// ```
3672///
3673/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3674#[inline]
3675#[target_feature(enable = "avx2")]
3676#[cfg_attr(test, assert_instr(vunpckhpd))]
3677#[stable(feature = "simd_x86", since = "1.27.0")]
3678pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3679    unsafe {
3680        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3681        transmute(r)
3682    }
3683}
3684
3685/// Unpacks and interleaves 64-bit integers from the low half of each
3686/// 128-bit lane of `a` and `b`.
3687///
3688/// ```rust
3689/// #[cfg(target_arch = "x86")]
3690/// use std::arch::x86::*;
3691/// #[cfg(target_arch = "x86_64")]
3692/// use std::arch::x86_64::*;
3693///
3694/// # fn main() {
3695/// #     if is_x86_feature_detected!("avx2") {
3696/// #         #[target_feature(enable = "avx2")]
3697/// #         unsafe fn worker() {
3698/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3699/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3700///
3701/// let c = _mm256_unpacklo_epi64(a, b);
3702///
3703/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3704/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3705///
3706/// #         }
3707/// #         unsafe { worker(); }
3708/// #     }
3709/// # }
3710/// ```
3711///
3712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3713#[inline]
3714#[target_feature(enable = "avx2")]
3715#[cfg_attr(test, assert_instr(vunpcklpd))]
3716#[stable(feature = "simd_x86", since = "1.27.0")]
3717pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3718    unsafe {
3719        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3720        transmute(r)
3721    }
3722}
3723
3724/// Computes the bitwise XOR of 256 bits (representing integer data)
3725/// in `a` and `b`.
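///
/// A small sketch (illustrative only):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(0b0101_0101);
/// let b = _mm256_set1_epi8(0b0011_0011);
/// let r = _mm256_xor_si256(a, b);
/// // 0b0101_0101 ^ 0b0011_0011 == 0b0110_0110 in every byte
/// assert_eq!(
///     _mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi8(0b0110_0110))),
///     !0
/// );
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```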
3726///
3727/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3728#[inline]
3729#[target_feature(enable = "avx2")]
3730#[cfg_attr(test, assert_instr(vxorps))]
3731#[stable(feature = "simd_x86", since = "1.27.0")]
3732pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3733    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3734}
3735
3736/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3737/// integer containing the zero-extended integer data.
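///
/// A sketch of the zero-extension behavior (illustrative only):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(-1);
/// // the selected byte is zero-extended, so -1 comes back as 255
/// assert_eq!(_mm256_extract_epi8::<0>(a), 255);
///
/// let b = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// assert_eq!(_mm256_extract_epi8::<20>(b), 20);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```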
3738///
3739/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3740///
3741/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3742#[inline]
3743#[target_feature(enable = "avx2")]
3744// This intrinsic has no corresponding instruction.
3745#[rustc_legacy_const_generics(1)]
3746#[stable(feature = "simd_x86", since = "1.27.0")]
3747pub fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3748    static_assert_uimm_bits!(INDEX, 5);
3749    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
3750}
3751
3752/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3753/// integer containing the zero-extended integer data.
3754///
3755/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3756///
3757/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3758#[inline]
3759#[target_feature(enable = "avx2")]
3760// This intrinsic has no corresponding instruction.
3761#[rustc_legacy_const_generics(1)]
3762#[stable(feature = "simd_x86", since = "1.27.0")]
3763pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3764    static_assert_uimm_bits!(INDEX, 4);
3765    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
3766}
3767
3768#[allow(improper_ctypes)]
3769unsafe extern "C" {
3770    #[link_name = "llvm.x86.avx2.phadd.w"]
3771    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3772    #[link_name = "llvm.x86.avx2.phadd.d"]
3773    fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3774    #[link_name = "llvm.x86.avx2.phadd.sw"]
3775    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3776    #[link_name = "llvm.x86.avx2.phsub.w"]
3777    fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3778    #[link_name = "llvm.x86.avx2.phsub.d"]
3779    fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3780    #[link_name = "llvm.x86.avx2.phsub.sw"]
3781    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3782    #[link_name = "llvm.x86.avx2.pmadd.wd"]
3783    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3784    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3785    fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3786    #[link_name = "llvm.x86.avx2.maskload.d"]
3787    fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3788    #[link_name = "llvm.x86.avx2.maskload.d.256"]
3789    fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3790    #[link_name = "llvm.x86.avx2.maskload.q"]
3791    fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3792    #[link_name = "llvm.x86.avx2.maskload.q.256"]
3793    fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3794    #[link_name = "llvm.x86.avx2.maskstore.d"]
3795    fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3796    #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3797    fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3798    #[link_name = "llvm.x86.avx2.maskstore.q"]
3799    fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3800    #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3801    fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3802    #[link_name = "llvm.x86.avx2.mpsadbw"]
3803    fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
3804    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3805    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3806    #[link_name = "llvm.x86.avx2.packsswb"]
3807    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3808    #[link_name = "llvm.x86.avx2.packssdw"]
3809    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3810    #[link_name = "llvm.x86.avx2.packuswb"]
3811    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3812    #[link_name = "llvm.x86.avx2.packusdw"]
3813    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3814    #[link_name = "llvm.x86.avx2.psad.bw"]
3815    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3816    #[link_name = "llvm.x86.avx2.psign.b"]
3817    fn psignb(a: i8x32, b: i8x32) -> i8x32;
3818    #[link_name = "llvm.x86.avx2.psign.w"]
3819    fn psignw(a: i16x16, b: i16x16) -> i16x16;
3820    #[link_name = "llvm.x86.avx2.psign.d"]
3821    fn psignd(a: i32x8, b: i32x8) -> i32x8;
3822    #[link_name = "llvm.x86.avx2.psll.w"]
3823    fn psllw(a: i16x16, count: i16x8) -> i16x16;
3824    #[link_name = "llvm.x86.avx2.psll.d"]
3825    fn pslld(a: i32x8, count: i32x4) -> i32x8;
3826    #[link_name = "llvm.x86.avx2.psll.q"]
3827    fn psllq(a: i64x4, count: i64x2) -> i64x4;
3828    #[link_name = "llvm.x86.avx2.psllv.d"]
3829    fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3830    #[link_name = "llvm.x86.avx2.psllv.d.256"]
3831    fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3832    #[link_name = "llvm.x86.avx2.psllv.q"]
3833    fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3834    #[link_name = "llvm.x86.avx2.psllv.q.256"]
3835    fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3836    #[link_name = "llvm.x86.avx2.psra.w"]
3837    fn psraw(a: i16x16, count: i16x8) -> i16x16;
3838    #[link_name = "llvm.x86.avx2.psra.d"]
3839    fn psrad(a: i32x8, count: i32x4) -> i32x8;
3840    #[link_name = "llvm.x86.avx2.psrav.d"]
3841    fn psravd(a: i32x4, count: i32x4) -> i32x4;
3842    #[link_name = "llvm.x86.avx2.psrav.d.256"]
3843    fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3844    #[link_name = "llvm.x86.avx2.psrl.w"]
3845    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3846    #[link_name = "llvm.x86.avx2.psrl.d"]
3847    fn psrld(a: i32x8, count: i32x4) -> i32x8;
3848    #[link_name = "llvm.x86.avx2.psrl.q"]
3849    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3850    #[link_name = "llvm.x86.avx2.psrlv.d"]
3851    fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3852    #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3853    fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3854    #[link_name = "llvm.x86.avx2.psrlv.q"]
3855    fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3856    #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3857    fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3858    #[link_name = "llvm.x86.avx2.pshuf.b"]
3859    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3860    #[link_name = "llvm.x86.avx2.permd"]
3861    fn permd(a: u32x8, b: u32x8) -> u32x8;
3862    #[link_name = "llvm.x86.avx2.permps"]
3863    fn permps(a: __m256, b: i32x8) -> __m256;
3864    #[link_name = "llvm.x86.avx2.vperm2i128"]
3865    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3866    #[link_name = "llvm.x86.avx2.gather.d.d"]
3867    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3868    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3869    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3870    #[link_name = "llvm.x86.avx2.gather.d.q"]
3871    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3872    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3873    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3874    #[link_name = "llvm.x86.avx2.gather.q.d"]
3875    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3876    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3877    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3878    #[link_name = "llvm.x86.avx2.gather.q.q"]
3879    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3880    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3881    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3882    #[link_name = "llvm.x86.avx2.gather.d.pd"]
3883    fn pgatherdpd(
3884        src: __m128d,
3885        slice: *const i8,
3886        offsets: i32x4,
3887        mask: __m128d,
3888        scale: i8,
3889    ) -> __m128d;
3890    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3891    fn vpgatherdpd(
3892        src: __m256d,
3893        slice: *const i8,
3894        offsets: i32x4,
3895        mask: __m256d,
3896        scale: i8,
3897    ) -> __m256d;
3898    #[link_name = "llvm.x86.avx2.gather.q.pd"]
3899    fn pgatherqpd(
3900        src: __m128d,
3901        slice: *const i8,
3902        offsets: i64x2,
3903        mask: __m128d,
3904        scale: i8,
3905    ) -> __m128d;
3906    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
3907    fn vpgatherqpd(
3908        src: __m256d,
3909        slice: *const i8,
3910        offsets: i64x4,
3911        mask: __m256d,
3912        scale: i8,
3913    ) -> __m256d;
3914    #[link_name = "llvm.x86.avx2.gather.d.ps"]
3915    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
3916    -> __m128;
3917    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
3918    fn vpgatherdps(
3919        src: __m256,
3920        slice: *const i8,
3921        offsets: i32x8,
3922        mask: __m256,
3923        scale: i8,
3924    ) -> __m256;
3925    #[link_name = "llvm.x86.avx2.gather.q.ps"]
3926    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
3927    -> __m128;
3928    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
3929    fn vpgatherqps(
3930        src: __m128,
3931        slice: *const i8,
3932        offsets: i64x4,
3933        mask: __m128,
3934        scale: i8,
3935    ) -> __m128;
3936}
3937
3938#[cfg(test)]
3939mod tests {
3940
3941    use stdarch_test::simd_test;
3942
3943    use crate::core_arch::x86::*;
3944
3945    #[simd_test(enable = "avx2")]
3946    unsafe fn test_mm256_abs_epi32() {
3947        #[rustfmt::skip]
3948        let a = _mm256_setr_epi32(
3949            0, 1, -1, i32::MAX,
3950            i32::MIN, 100, -100, -32,
3951        );
3952        let r = _mm256_abs_epi32(a);
3953        #[rustfmt::skip]
3954        let e = _mm256_setr_epi32(
3955            0, 1, 1, i32::MAX,
3956            i32::MAX.wrapping_add(1), 100, 100, 32,
3957        );
3958        assert_eq_m256i(r, e);
3959    }
3960
3961    #[simd_test(enable = "avx2")]
3962    unsafe fn test_mm256_abs_epi16() {
3963        #[rustfmt::skip]
3964        let a = _mm256_setr_epi16(
3965            0,  1, -1, 2, -2, 3, -3, 4,
3966            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
3967        );
3968        let r = _mm256_abs_epi16(a);
3969        #[rustfmt::skip]
3970        let e = _mm256_setr_epi16(
3971            0, 1, 1, 2, 2, 3, 3, 4,
3972            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
3973        );
3974        assert_eq_m256i(r, e);
3975    }
3976
3977    #[simd_test(enable = "avx2")]
3978    unsafe fn test_mm256_abs_epi8() {
3979        #[rustfmt::skip]
3980        let a = _mm256_setr_epi8(
3981            0, 1, -1, 2, -2, 3, -3, 4,
3982            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3983            0, 1, -1, 2, -2, 3, -3, 4,
3984            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3985        );
3986        let r = _mm256_abs_epi8(a);
3987        #[rustfmt::skip]
3988        let e = _mm256_setr_epi8(
3989            0, 1, 1, 2, 2, 3, 3, 4,
3990            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3991            0, 1, 1, 2, 2, 3, 3, 4,
3992            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3993        );
3994        assert_eq_m256i(r, e);
3995    }
3996
3997    #[simd_test(enable = "avx2")]
3998    unsafe fn test_mm256_add_epi64() {
3999        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4000        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4001        let r = _mm256_add_epi64(a, b);
4002        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4003        assert_eq_m256i(r, e);
4004    }
4005
4006    #[simd_test(enable = "avx2")]
4007    unsafe fn test_mm256_add_epi32() {
4008        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4009        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4010        let r = _mm256_add_epi32(a, b);
4011        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4012        assert_eq_m256i(r, e);
4013    }
4014
4015    #[simd_test(enable = "avx2")]
4016    unsafe fn test_mm256_add_epi16() {
4017        #[rustfmt::skip]
4018        let a = _mm256_setr_epi16(
4019            0, 1, 2, 3, 4, 5, 6, 7,
4020            8, 9, 10, 11, 12, 13, 14, 15,
4021        );
4022        #[rustfmt::skip]
4023        let b = _mm256_setr_epi16(
4024            0, 1, 2, 3, 4, 5, 6, 7,
4025            8, 9, 10, 11, 12, 13, 14, 15,
4026        );
4027        let r = _mm256_add_epi16(a, b);
4028        #[rustfmt::skip]
4029        let e = _mm256_setr_epi16(
4030            0, 2, 4, 6, 8, 10, 12, 14,
4031            16, 18, 20, 22, 24, 26, 28, 30,
4032        );
4033        assert_eq_m256i(r, e);
4034    }
4035
4036    #[simd_test(enable = "avx2")]
4037    unsafe fn test_mm256_add_epi8() {
4038        #[rustfmt::skip]
4039        let a = _mm256_setr_epi8(
4040            0, 1, 2, 3, 4, 5, 6, 7,
4041            8, 9, 10, 11, 12, 13, 14, 15,
4042            16, 17, 18, 19, 20, 21, 22, 23,
4043            24, 25, 26, 27, 28, 29, 30, 31,
4044        );
4045        #[rustfmt::skip]
4046        let b = _mm256_setr_epi8(
4047            0, 1, 2, 3, 4, 5, 6, 7,
4048            8, 9, 10, 11, 12, 13, 14, 15,
4049            16, 17, 18, 19, 20, 21, 22, 23,
4050            24, 25, 26, 27, 28, 29, 30, 31,
4051        );
4052        let r = _mm256_add_epi8(a, b);
4053        #[rustfmt::skip]
4054        let e = _mm256_setr_epi8(
4055            0, 2, 4, 6, 8, 10, 12, 14,
4056            16, 18, 20, 22, 24, 26, 28, 30,
4057            32, 34, 36, 38, 40, 42, 44, 46,
4058            48, 50, 52, 54, 56, 58, 60, 62,
4059        );
4060        assert_eq_m256i(r, e);
4061    }
4062
4063    #[simd_test(enable = "avx2")]
4064    unsafe fn test_mm256_adds_epi8() {
4065        #[rustfmt::skip]
4066        let a = _mm256_setr_epi8(
4067            0, 1, 2, 3, 4, 5, 6, 7,
4068            8, 9, 10, 11, 12, 13, 14, 15,
4069            16, 17, 18, 19, 20, 21, 22, 23,
4070            24, 25, 26, 27, 28, 29, 30, 31,
4071        );
4072        #[rustfmt::skip]
4073        let b = _mm256_setr_epi8(
4074            32, 33, 34, 35, 36, 37, 38, 39,
4075            40, 41, 42, 43, 44, 45, 46, 47,
4076            48, 49, 50, 51, 52, 53, 54, 55,
4077            56, 57, 58, 59, 60, 61, 62, 63,
4078        );
4079        let r = _mm256_adds_epi8(a, b);
4080        #[rustfmt::skip]
4081        let e = _mm256_setr_epi8(
4082            32, 34, 36, 38, 40, 42, 44, 46,
4083            48, 50, 52, 54, 56, 58, 60, 62,
4084            64, 66, 68, 70, 72, 74, 76, 78,
4085            80, 82, 84, 86, 88, 90, 92, 94,
4086        );
4087        assert_eq_m256i(r, e);
4088    }
4089
4090    #[simd_test(enable = "avx2")]
4091    unsafe fn test_mm256_adds_epi8_saturate_positive() {
4092        let a = _mm256_set1_epi8(0x7F);
4093        let b = _mm256_set1_epi8(1);
4094        let r = _mm256_adds_epi8(a, b);
4095        assert_eq_m256i(r, a);
4096    }
4097
4098    #[simd_test(enable = "avx2")]
4099    unsafe fn test_mm256_adds_epi8_saturate_negative() {
4100        let a = _mm256_set1_epi8(-0x80);
4101        let b = _mm256_set1_epi8(-1);
4102        let r = _mm256_adds_epi8(a, b);
4103        assert_eq_m256i(r, a);
4104    }
4105
4106    #[simd_test(enable = "avx2")]
4107    unsafe fn test_mm256_adds_epi16() {
4108        #[rustfmt::skip]
4109        let a = _mm256_setr_epi16(
4110            0, 1, 2, 3, 4, 5, 6, 7,
4111            8, 9, 10, 11, 12, 13, 14, 15,
4112        );
4113        #[rustfmt::skip]
4114        let b = _mm256_setr_epi16(
4115            32, 33, 34, 35, 36, 37, 38, 39,
4116            40, 41, 42, 43, 44, 45, 46, 47,
4117        );
4118        let r = _mm256_adds_epi16(a, b);
4119        #[rustfmt::skip]
4120        let e = _mm256_setr_epi16(
4121            32, 34, 36, 38, 40, 42, 44, 46,
4122            48, 50, 52, 54, 56, 58, 60, 62,
4123        );
4124
4125        assert_eq_m256i(r, e);
4126    }
4127
4128    #[simd_test(enable = "avx2")]
4129    unsafe fn test_mm256_adds_epi16_saturate_positive() {
4130        let a = _mm256_set1_epi16(0x7FFF);
4131        let b = _mm256_set1_epi16(1);
4132        let r = _mm256_adds_epi16(a, b);
4133        assert_eq_m256i(r, a);
4134    }
4135
4136    #[simd_test(enable = "avx2")]
4137    unsafe fn test_mm256_adds_epi16_saturate_negative() {
4138        let a = _mm256_set1_epi16(-0x8000);
4139        let b = _mm256_set1_epi16(-1);
4140        let r = _mm256_adds_epi16(a, b);
4141        assert_eq_m256i(r, a);
4142    }
4143
4144    #[simd_test(enable = "avx2")]
4145    unsafe fn test_mm256_adds_epu8() {
4146        #[rustfmt::skip]
4147        let a = _mm256_setr_epi8(
4148            0, 1, 2, 3, 4, 5, 6, 7,
4149            8, 9, 10, 11, 12, 13, 14, 15,
4150            16, 17, 18, 19, 20, 21, 22, 23,
4151            24, 25, 26, 27, 28, 29, 30, 31,
4152        );
4153        #[rustfmt::skip]
4154        let b = _mm256_setr_epi8(
4155            32, 33, 34, 35, 36, 37, 38, 39,
4156            40, 41, 42, 43, 44, 45, 46, 47,
4157            48, 49, 50, 51, 52, 53, 54, 55,
4158            56, 57, 58, 59, 60, 61, 62, 63,
4159        );
4160        let r = _mm256_adds_epu8(a, b);
4161        #[rustfmt::skip]
4162        let e = _mm256_setr_epi8(
4163            32, 34, 36, 38, 40, 42, 44, 46,
4164            48, 50, 52, 54, 56, 58, 60, 62,
4165            64, 66, 68, 70, 72, 74, 76, 78,
4166            80, 82, 84, 86, 88, 90, 92, 94,
4167        );
4168        assert_eq_m256i(r, e);
4169    }
4170
4171    #[simd_test(enable = "avx2")]
4172    unsafe fn test_mm256_adds_epu8_saturate() {
4173        let a = _mm256_set1_epi8(!0);
4174        let b = _mm256_set1_epi8(1);
4175        let r = _mm256_adds_epu8(a, b);
4176        assert_eq_m256i(r, a);
4177    }
4178
4179    #[simd_test(enable = "avx2")]
4180    unsafe fn test_mm256_adds_epu16() {
4181        #[rustfmt::skip]
4182        let a = _mm256_setr_epi16(
4183            0, 1, 2, 3, 4, 5, 6, 7,
4184            8, 9, 10, 11, 12, 13, 14, 15,
4185        );
4186        #[rustfmt::skip]
4187        let b = _mm256_setr_epi16(
4188            32, 33, 34, 35, 36, 37, 38, 39,
4189            40, 41, 42, 43, 44, 45, 46, 47,
4190        );
4191        let r = _mm256_adds_epu16(a, b);
4192        #[rustfmt::skip]
4193        let e = _mm256_setr_epi16(
4194            32, 34, 36, 38, 40, 42, 44, 46,
4195            48, 50, 52, 54, 56, 58, 60, 62,
4196        );
4197
4198        assert_eq_m256i(r, e);
4199    }
4200
4201    #[simd_test(enable = "avx2")]
4202    unsafe fn test_mm256_adds_epu16_saturate() {
4203        let a = _mm256_set1_epi16(!0);
4204        let b = _mm256_set1_epi16(1);
4205        let r = _mm256_adds_epu16(a, b);
4206        assert_eq_m256i(r, a);
4207    }
4208
4209    #[simd_test(enable = "avx2")]
4210    unsafe fn test_mm256_and_si256() {
4211        let a = _mm256_set1_epi8(5);
4212        let b = _mm256_set1_epi8(3);
4213        let got = _mm256_and_si256(a, b);
4214        assert_eq_m256i(got, _mm256_set1_epi8(1));
4215    }
4216
4217    #[simd_test(enable = "avx2")]
4218    unsafe fn test_mm256_andnot_si256() {
4219        let a = _mm256_set1_epi8(5);
4220        let b = _mm256_set1_epi8(3);
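        // andnot computes (!a) & b: with a == 0b101 and b == 0b011 the result is 0b010 == 2.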
4221        let got = _mm256_andnot_si256(a, b);
4222        assert_eq_m256i(got, _mm256_set1_epi8(2));
4223    }
4224
4225    #[simd_test(enable = "avx2")]
4226    unsafe fn test_mm256_avg_epu8() {
4227        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4228        let r = _mm256_avg_epu8(a, b);
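        // The unsigned average rounds up: (3 + 9 + 1) >> 1 == 6.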
4229        assert_eq_m256i(r, _mm256_set1_epi8(6));
4230    }
4231
4232    #[simd_test(enable = "avx2")]
4233    unsafe fn test_mm256_avg_epu16() {
4234        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4235        let r = _mm256_avg_epu16(a, b);
4236        assert_eq_m256i(r, _mm256_set1_epi16(6));
4237    }
4238
4239    #[simd_test(enable = "avx2")]
4240    unsafe fn test_mm_blend_epi32() {
4241        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
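        // Each bit of the immediate picks b (1) or a (0) for the matching 32-bit lane.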
4242        let e = _mm_setr_epi32(9, 3, 3, 3);
4243        let r = _mm_blend_epi32::<0x01>(a, b);
4244        assert_eq_m128i(r, e);
4245
4246        let r = _mm_blend_epi32::<0x0E>(b, a);
4247        assert_eq_m128i(r, e);
4248    }
4249
4250    #[simd_test(enable = "avx2")]
4251    unsafe fn test_mm256_blend_epi32() {
4252        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4253        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4254        let r = _mm256_blend_epi32::<0x01>(a, b);
4255        assert_eq_m256i(r, e);
4256
4257        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4258        let r = _mm256_blend_epi32::<0x82>(a, b);
4259        assert_eq_m256i(r, e);
4260
4261        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4262        let r = _mm256_blend_epi32::<0x7C>(a, b);
4263        assert_eq_m256i(r, e);
4264    }
4265
4266    #[simd_test(enable = "avx2")]
4267    unsafe fn test_mm256_blend_epi16() {
4268        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
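        // The 8-bit immediate is reused for both 128-bit lanes, so bit 0 selects elements 0 and 8.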
4269        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4270        let r = _mm256_blend_epi16::<0x01>(a, b);
4271        assert_eq_m256i(r, e);
4272
4273        let r = _mm256_blend_epi16::<0xFE>(b, a);
4274        assert_eq_m256i(r, e);
4275    }
4276
4277    #[simd_test(enable = "avx2")]
4278    unsafe fn test_mm256_blendv_epi8() {
4279        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4280        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
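        // Only the sign bit of each mask byte is examined; byte 2 has it set, so that byte comes from b.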
4281        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
4282        let r = _mm256_blendv_epi8(a, b, mask);
4283        assert_eq_m256i(r, e);
4284    }
4285
4286    #[simd_test(enable = "avx2")]
4287    unsafe fn test_mm_broadcastb_epi8() {
4288        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4289        let res = _mm_broadcastb_epi8(a);
4290        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4291    }
4292
4293    #[simd_test(enable = "avx2")]
4294    unsafe fn test_mm256_broadcastb_epi8() {
4295        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4296        let res = _mm256_broadcastb_epi8(a);
4297        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4298    }
4299
4300    #[simd_test(enable = "avx2")]
4301    unsafe fn test_mm_broadcastd_epi32() {
4302        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4303        let res = _mm_broadcastd_epi32(a);
4304        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4305    }
4306
4307    #[simd_test(enable = "avx2")]
4308    unsafe fn test_mm256_broadcastd_epi32() {
4309        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4310        let res = _mm256_broadcastd_epi32(a);
4311        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4312    }
4313
4314    #[simd_test(enable = "avx2")]
4315    unsafe fn test_mm_broadcastq_epi64() {
4316        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4317        let res = _mm_broadcastq_epi64(a);
4318        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4319    }
4320
4321    #[simd_test(enable = "avx2")]
4322    unsafe fn test_mm256_broadcastq_epi64() {
4323        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4324        let res = _mm256_broadcastq_epi64(a);
4325        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4326    }
4327
4328    #[simd_test(enable = "avx2")]
4329    unsafe fn test_mm_broadcastsd_pd() {
4330        let a = _mm_setr_pd(6.88, 3.44);
4331        let res = _mm_broadcastsd_pd(a);
4332        assert_eq_m128d(res, _mm_set1_pd(6.88));
4333    }
4334
4335    #[simd_test(enable = "avx2")]
4336    unsafe fn test_mm256_broadcastsd_pd() {
4337        let a = _mm_setr_pd(6.88, 3.44);
4338        let res = _mm256_broadcastsd_pd(a);
4339        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
4340    }
4341
4342    #[simd_test(enable = "avx2")]
4343    unsafe fn test_mm_broadcastsi128_si256() {
4344        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4345        let res = _mm_broadcastsi128_si256(a);
4346        let retval = _mm256_setr_epi64x(
4347            0x0987654321012334,
4348            0x5678909876543210,
4349            0x0987654321012334,
4350            0x5678909876543210,
4351        );
4352        assert_eq_m256i(res, retval);
4353    }
4354
4355    #[simd_test(enable = "avx2")]
4356    unsafe fn test_mm256_broadcastsi128_si256() {
4357        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4358        let res = _mm256_broadcastsi128_si256(a);
4359        let retval = _mm256_setr_epi64x(
4360            0x0987654321012334,
4361            0x5678909876543210,
4362            0x0987654321012334,
4363            0x5678909876543210,
4364        );
4365        assert_eq_m256i(res, retval);
4366    }
4367
4368    #[simd_test(enable = "avx2")]
4369    unsafe fn test_mm_broadcastss_ps() {
4370        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4371        let res = _mm_broadcastss_ps(a);
4372        assert_eq_m128(res, _mm_set1_ps(6.88));
4373    }
4374
4375    #[simd_test(enable = "avx2")]
4376    unsafe fn test_mm256_broadcastss_ps() {
4377        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4378        let res = _mm256_broadcastss_ps(a);
4379        assert_eq_m256(res, _mm256_set1_ps(6.88));
4380    }
4381
4382    #[simd_test(enable = "avx2")]
4383    unsafe fn test_mm_broadcastw_epi16() {
4384        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4385        let res = _mm_broadcastw_epi16(a);
4386        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4387    }
4388
4389    #[simd_test(enable = "avx2")]
4390    unsafe fn test_mm256_broadcastw_epi16() {
4391        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4392        let res = _mm256_broadcastw_epi16(a);
4393        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4394    }
4395
4396    #[simd_test(enable = "avx2")]
4397    unsafe fn test_mm256_cmpeq_epi8() {
4398        #[rustfmt::skip]
4399        let a = _mm256_setr_epi8(
4400            0, 1, 2, 3, 4, 5, 6, 7,
4401            8, 9, 10, 11, 12, 13, 14, 15,
4402            16, 17, 18, 19, 20, 21, 22, 23,
4403            24, 25, 26, 27, 28, 29, 30, 31,
4404        );
4405        #[rustfmt::skip]
4406        let b = _mm256_setr_epi8(
4407            31, 30, 2, 28, 27, 26, 25, 24,
4408            23, 22, 21, 20, 19, 18, 17, 16,
4409            15, 14, 13, 12, 11, 10, 9, 8,
4410            7, 6, 5, 4, 3, 2, 1, 0,
4411        );
4412        let r = _mm256_cmpeq_epi8(a, b);
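        // Only index 2 compares equal, yielding !0 (0xFF) in that byte and zero elsewhere.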
4413        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
4414    }
4415
4416    #[simd_test(enable = "avx2")]
4417    unsafe fn test_mm256_cmpeq_epi16() {
4418        #[rustfmt::skip]
4419        let a = _mm256_setr_epi16(
4420            0, 1, 2, 3, 4, 5, 6, 7,
4421            8, 9, 10, 11, 12, 13, 14, 15,
4422        );
4423        #[rustfmt::skip]
4424        let b = _mm256_setr_epi16(
4425            15, 14, 2, 12, 11, 10, 9, 8,
4426            7, 6, 5, 4, 3, 2, 1, 0,
4427        );
4428        let r = _mm256_cmpeq_epi16(a, b);
4429        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
4430    }
4431
4432    #[simd_test(enable = "avx2")]
4433    unsafe fn test_mm256_cmpeq_epi32() {
4434        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4435        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4436        let r = _mm256_cmpeq_epi32(a, b);
4437        let e = _mm256_set1_epi32(0);
4438        let e = _mm256_insert_epi32::<2>(e, !0);
4439        assert_eq_m256i(r, e);
4440    }
4441
4442    #[simd_test(enable = "avx2")]
4443    unsafe fn test_mm256_cmpeq_epi64() {
4444        let a = _mm256_setr_epi64x(0, 1, 2, 3);
4445        let b = _mm256_setr_epi64x(3, 2, 2, 0);
4446        let r = _mm256_cmpeq_epi64(a, b);
4447        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
4448    }
4449
4450    #[simd_test(enable = "avx2")]
4451    unsafe fn test_mm256_cmpgt_epi8() {
4452        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
4453        let b = _mm256_set1_epi8(0);
4454        let r = _mm256_cmpgt_epi8(a, b);
4455        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
4456    }
4457
4458    #[simd_test(enable = "avx2")]
4459    unsafe fn test_mm256_cmpgt_epi16() {
4460        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
4461        let b = _mm256_set1_epi16(0);
4462        let r = _mm256_cmpgt_epi16(a, b);
4463        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
4464    }
4465
4466    #[simd_test(enable = "avx2")]
4467    unsafe fn test_mm256_cmpgt_epi32() {
4468        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
4469        let b = _mm256_set1_epi32(0);
4470        let r = _mm256_cmpgt_epi32(a, b);
4471        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
4472    }
4473
4474    #[simd_test(enable = "avx2")]
4475    unsafe fn test_mm256_cmpgt_epi64() {
4476        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
4477        let b = _mm256_set1_epi64x(0);
4478        let r = _mm256_cmpgt_epi64(a, b);
4479        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
4480    }
4481
4482    #[simd_test(enable = "avx2")]
4483    unsafe fn test_mm256_cvtepi8_epi16() {
4484        #[rustfmt::skip]
4485        let a = _mm_setr_epi8(
4486            0, 0, -1, 1, -2, 2, -3, 3,
4487            -4, 4, -5, 5, -6, 6, -7, 7,
4488        );
4489        #[rustfmt::skip]
4490        let r = _mm256_setr_epi16(
4491            0, 0, -1, 1, -2, 2, -3, 3,
4492            -4, 4, -5, 5, -6, 6, -7, 7,
4493        );
4494        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4495    }
4496
4497    #[simd_test(enable = "avx2")]
4498    unsafe fn test_mm256_cvtepi8_epi32() {
4499        #[rustfmt::skip]
4500        let a = _mm_setr_epi8(
4501            0, 0, -1, 1, -2, 2, -3, 3,
4502            -4, 4, -5, 5, -6, 6, -7, 7,
4503        );
4504        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4505        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4506    }
4507
4508    #[simd_test(enable = "avx2")]
4509    unsafe fn test_mm256_cvtepi8_epi64() {
4510        #[rustfmt::skip]
4511        let a = _mm_setr_epi8(
4512            0, 0, -1, 1, -2, 2, -3, 3,
4513            -4, 4, -5, 5, -6, 6, -7, 7,
4514        );
4515        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4516        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4517    }
4518
4519    #[simd_test(enable = "avx2")]
4520    unsafe fn test_mm256_cvtepi16_epi32() {
4521        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4522        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4523        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4524    }
4525
4526    #[simd_test(enable = "avx2")]
4527    unsafe fn test_mm256_cvtepi16_epi64() {
4528        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4529        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4530        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4531    }
4532
4533    #[simd_test(enable = "avx2")]
4534    unsafe fn test_mm256_cvtepi32_epi64() {
4535        let a = _mm_setr_epi32(0, 0, -1, 1);
4536        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4537        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4538    }
4539
4540    #[simd_test(enable = "avx2")]
4541    unsafe fn test_mm256_cvtepu16_epi32() {
4542        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4543        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4544        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4545    }
4546
4547    #[simd_test(enable = "avx2")]
4548    unsafe fn test_mm256_cvtepu16_epi64() {
4549        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4550        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4551        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4552    }
4553
4554    #[simd_test(enable = "avx2")]
4555    unsafe fn test_mm256_cvtepu32_epi64() {
4556        let a = _mm_setr_epi32(0, 1, 2, 3);
4557        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4558        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4559    }
4560
4561    #[simd_test(enable = "avx2")]
4562    unsafe fn test_mm256_cvtepu8_epi16() {
4563        #[rustfmt::skip]
4564        let a = _mm_setr_epi8(
4565            0, 1, 2, 3, 4, 5, 6, 7,
4566            8, 9, 10, 11, 12, 13, 14, 15,
4567        );
4568        #[rustfmt::skip]
4569        let r = _mm256_setr_epi16(
4570            0, 1, 2, 3, 4, 5, 6, 7,
4571            8, 9, 10, 11, 12, 13, 14, 15,
4572        );
4573        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4574    }
4575
4576    #[simd_test(enable = "avx2")]
4577    unsafe fn test_mm256_cvtepu8_epi32() {
4578        #[rustfmt::skip]
4579        let a = _mm_setr_epi8(
4580            0, 1, 2, 3, 4, 5, 6, 7,
4581            8, 9, 10, 11, 12, 13, 14, 15,
4582        );
4583        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4584        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4585    }
4586
4587    #[simd_test(enable = "avx2")]
4588    unsafe fn test_mm256_cvtepu8_epi64() {
4589        #[rustfmt::skip]
4590        let a = _mm_setr_epi8(
4591            0, 1, 2, 3, 4, 5, 6, 7,
4592            8, 9, 10, 11, 12, 13, 14, 15,
4593        );
4594        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4595        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4596    }
4597
4598    #[simd_test(enable = "avx2")]
4599    unsafe fn test_mm256_extracti128_si256() {
4600        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4601        let r = _mm256_extracti128_si256::<1>(a);
4602        let e = _mm_setr_epi64x(3, 4);
4603        assert_eq_m128i(r, e);
4604    }
4605
4606    #[simd_test(enable = "avx2")]
4607    unsafe fn test_mm256_hadd_epi16() {
4608        let a = _mm256_set1_epi16(2);
4609        let b = _mm256_set1_epi16(4);
4610        let r = _mm256_hadd_epi16(a, b);
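        // Horizontal sums of a and b are interleaved per 128-bit lane: four sums from a, then four from b.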
4611        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4612        assert_eq_m256i(r, e);
4613    }
4614
4615    #[simd_test(enable = "avx2")]
4616    unsafe fn test_mm256_hadd_epi32() {
4617        let a = _mm256_set1_epi32(2);
4618        let b = _mm256_set1_epi32(4);
4619        let r = _mm256_hadd_epi32(a, b);
4620        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4621        assert_eq_m256i(r, e);
4622    }
4623
4624    #[simd_test(enable = "avx2")]
4625    unsafe fn test_mm256_hadds_epi16() {
4626        let a = _mm256_set1_epi16(2);
4627        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4628        let a = _mm256_insert_epi16::<1>(a, 1);
4629        let b = _mm256_set1_epi16(4);
4630        let r = _mm256_hadds_epi16(a, b);
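        // The first pair 0x7FFF + 1 saturates to 0x7FFF instead of wrapping.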
4631        #[rustfmt::skip]
4632        let e = _mm256_setr_epi16(
4633            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4634            4, 4, 4, 4, 8, 8, 8, 8,
4635        );
4636        assert_eq_m256i(r, e);
4637    }
4638
4639    #[simd_test(enable = "avx2")]
4640    unsafe fn test_mm256_hsub_epi16() {
4641        let a = _mm256_set1_epi16(2);
4642        let b = _mm256_set1_epi16(4);
4643        let r = _mm256_hsub_epi16(a, b);
4644        let e = _mm256_set1_epi16(0);
4645        assert_eq_m256i(r, e);
4646    }
4647
4648    #[simd_test(enable = "avx2")]
4649    unsafe fn test_mm256_hsub_epi32() {
4650        let a = _mm256_set1_epi32(2);
4651        let b = _mm256_set1_epi32(4);
4652        let r = _mm256_hsub_epi32(a, b);
4653        let e = _mm256_set1_epi32(0);
4654        assert_eq_m256i(r, e);
4655    }
4656
4657    #[simd_test(enable = "avx2")]
4658    unsafe fn test_mm256_hsubs_epi16() {
4659        let a = _mm256_set1_epi16(2);
4660        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4661        let a = _mm256_insert_epi16::<1>(a, -1);
4662        let b = _mm256_set1_epi16(4);
4663        let r = _mm256_hsubs_epi16(a, b);
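        // The first pair 0x7FFF - (-1) saturates to 0x7FFF; every other pair subtracts to 0.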
4664        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
4665        assert_eq_m256i(r, e);
4666    }
4667
4668    #[simd_test(enable = "avx2")]
4669    unsafe fn test_mm256_madd_epi16() {
4670        let a = _mm256_set1_epi16(2);
4671        let b = _mm256_set1_epi16(4);
4672        let r = _mm256_madd_epi16(a, b);
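        // Adjacent products are summed into 32-bit lanes: 2 * 4 + 2 * 4 == 16.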
4673        let e = _mm256_set1_epi32(16);
4674        assert_eq_m256i(r, e);
4675    }
4676
4677    #[simd_test(enable = "avx2")]
4678    unsafe fn test_mm256_inserti128_si256() {
4679        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4680        let b = _mm_setr_epi64x(7, 8);
4681        let r = _mm256_inserti128_si256::<1>(a, b);
4682        let e = _mm256_setr_epi64x(1, 2, 7, 8);
4683        assert_eq_m256i(r, e);
4684    }
4685
4686    #[simd_test(enable = "avx2")]
4687    unsafe fn test_mm256_maddubs_epi16() {
4688        let a = _mm256_set1_epi8(2);
4689        let b = _mm256_set1_epi8(4);
4690        let r = _mm256_maddubs_epi16(a, b);
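        // a is treated as unsigned and b as signed; each 16-bit lane holds 2 * 4 + 2 * 4 == 16.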
4691        let e = _mm256_set1_epi16(16);
4692        assert_eq_m256i(r, e);
4693    }
4694
4695    #[simd_test(enable = "avx2")]
4696    unsafe fn test_mm_maskload_epi32() {
4697        let nums = [1, 2, 3, 4];
4698        let a = &nums as *const i32;
4699        let mask = _mm_setr_epi32(-1, 0, 0, -1);
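        // Elements are loaded only where the mask element's sign bit is set; masked-off lanes read as zero.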
4700        let r = _mm_maskload_epi32(a, mask);
4701        let e = _mm_setr_epi32(1, 0, 0, 4);
4702        assert_eq_m128i(r, e);
4703    }
4704
4705    #[simd_test(enable = "avx2")]
4706    unsafe fn test_mm256_maskload_epi32() {
4707        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4708        let a = &nums as *const i32;
4709        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4710        let r = _mm256_maskload_epi32(a, mask);
4711        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4712        assert_eq_m256i(r, e);
4713    }
4714
4715    #[simd_test(enable = "avx2")]
4716    unsafe fn test_mm_maskload_epi64() {
4717        let nums = [1_i64, 2_i64];
4718        let a = &nums as *const i64;
4719        let mask = _mm_setr_epi64x(0, -1);
4720        let r = _mm_maskload_epi64(a, mask);
4721        let e = _mm_setr_epi64x(0, 2);
4722        assert_eq_m128i(r, e);
4723    }
4724
4725    #[simd_test(enable = "avx2")]
4726    unsafe fn test_mm256_maskload_epi64() {
4727        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4728        let a = &nums as *const i64;
4729        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4730        let r = _mm256_maskload_epi64(a, mask);
4731        let e = _mm256_setr_epi64x(0, 2, 3, 0);
4732        assert_eq_m256i(r, e);
4733    }
4734
4735    #[simd_test(enable = "avx2")]
4736    unsafe fn test_mm_maskstore_epi32() {
4737        let a = _mm_setr_epi32(1, 2, 3, 4);
4738        let mut arr = [-1, -1, -1, -1];
4739        let mask = _mm_setr_epi32(-1, 0, 0, -1);
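        // Only lanes whose mask sign bit is set are stored; the other elements of arr keep their old values.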
4740        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4741        let e = [1, -1, -1, 4];
4742        assert_eq!(arr, e);
4743    }
4744
4745    #[simd_test(enable = "avx2")]
4746    unsafe fn test_mm256_maskstore_epi32() {
4747        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4748        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4749        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4750        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4751        let e = [1, -1, -1, 42, -1, 6, 7, -1];
4752        assert_eq!(arr, e);
4753    }
4754
4755    #[simd_test(enable = "avx2")]
4756    unsafe fn test_mm_maskstore_epi64() {
4757        let a = _mm_setr_epi64x(1_i64, 2_i64);
4758        let mut arr = [-1_i64, -1_i64];
4759        let mask = _mm_setr_epi64x(0, -1);
4760        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4761        let e = [-1, 2];
4762        assert_eq!(arr, e);
4763    }
4764
4765    #[simd_test(enable = "avx2")]
4766    unsafe fn test_mm256_maskstore_epi64() {
4767        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4768        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4769        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4770        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4771        let e = [-1, 2, 3, -1];
4772        assert_eq!(arr, e);
4773    }
4774
4775    #[simd_test(enable = "avx2")]
4776    unsafe fn test_mm256_max_epi16() {
4777        let a = _mm256_set1_epi16(2);
4778        let b = _mm256_set1_epi16(4);
4779        let r = _mm256_max_epi16(a, b);
4780        assert_eq_m256i(r, b);
4781    }
4782
4783    #[simd_test(enable = "avx2")]
4784    unsafe fn test_mm256_max_epi32() {
4785        let a = _mm256_set1_epi32(2);
4786        let b = _mm256_set1_epi32(4);
4787        let r = _mm256_max_epi32(a, b);
4788        assert_eq_m256i(r, b);
4789    }
4790
4791    #[simd_test(enable = "avx2")]
4792    unsafe fn test_mm256_max_epi8() {
4793        let a = _mm256_set1_epi8(2);
4794        let b = _mm256_set1_epi8(4);
4795        let r = _mm256_max_epi8(a, b);
4796        assert_eq_m256i(r, b);
4797    }
4798
4799    #[simd_test(enable = "avx2")]
4800    unsafe fn test_mm256_max_epu16() {
4801        let a = _mm256_set1_epi16(2);
4802        let b = _mm256_set1_epi16(4);
4803        let r = _mm256_max_epu16(a, b);
4804        assert_eq_m256i(r, b);
4805    }
4806
4807    #[simd_test(enable = "avx2")]
4808    unsafe fn test_mm256_max_epu32() {
4809        let a = _mm256_set1_epi32(2);
4810        let b = _mm256_set1_epi32(4);
4811        let r = _mm256_max_epu32(a, b);
4812        assert_eq_m256i(r, b);
4813    }
4814
4815    #[simd_test(enable = "avx2")]
4816    unsafe fn test_mm256_max_epu8() {
4817        let a = _mm256_set1_epi8(2);
4818        let b = _mm256_set1_epi8(4);
4819        let r = _mm256_max_epu8(a, b);
4820        assert_eq_m256i(r, b);
4821    }
4822
4823    #[simd_test(enable = "avx2")]
4824    unsafe fn test_mm256_min_epi16() {
4825        let a = _mm256_set1_epi16(2);
4826        let b = _mm256_set1_epi16(4);
4827        let r = _mm256_min_epi16(a, b);
4828        assert_eq_m256i(r, a);
4829    }
4830
4831    #[simd_test(enable = "avx2")]
4832    unsafe fn test_mm256_min_epi32() {
4833        let a = _mm256_set1_epi32(2);
4834        let b = _mm256_set1_epi32(4);
4835        let r = _mm256_min_epi32(a, b);
4836        assert_eq_m256i(r, a);
4837    }
4838
4839    #[simd_test(enable = "avx2")]
4840    unsafe fn test_mm256_min_epi8() {
4841        let a = _mm256_set1_epi8(2);
4842        let b = _mm256_set1_epi8(4);
4843        let r = _mm256_min_epi8(a, b);
4844        assert_eq_m256i(r, a);
4845    }
4846
4847    #[simd_test(enable = "avx2")]
4848    unsafe fn test_mm256_min_epu16() {
4849        let a = _mm256_set1_epi16(2);
4850        let b = _mm256_set1_epi16(4);
4851        let r = _mm256_min_epu16(a, b);
4852        assert_eq_m256i(r, a);
4853    }
4854
4855    #[simd_test(enable = "avx2")]
4856    unsafe fn test_mm256_min_epu32() {
4857        let a = _mm256_set1_epi32(2);
4858        let b = _mm256_set1_epi32(4);
4859        let r = _mm256_min_epu32(a, b);
4860        assert_eq_m256i(r, a);
4861    }
4862
4863    #[simd_test(enable = "avx2")]
4864    unsafe fn test_mm256_min_epu8() {
4865        let a = _mm256_set1_epi8(2);
4866        let b = _mm256_set1_epi8(4);
4867        let r = _mm256_min_epu8(a, b);
4868        assert_eq_m256i(r, a);
4869    }
4870
4871    #[simd_test(enable = "avx2")]
4872    unsafe fn test_mm256_movemask_epi8() {
4873        let a = _mm256_set1_epi8(-1);
4874        let r = _mm256_movemask_epi8(a);
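        // Every byte has its sign bit set, so all 32 bits of the mask are 1, i.e. -1 as an i32.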
4875        let e = -1;
4876        assert_eq!(r, e);
4877    }
4878
4879    #[simd_test(enable = "avx2")]
4880    unsafe fn test_mm256_mpsadbw_epu8() {
4881        let a = _mm256_set1_epi8(2);
4882        let b = _mm256_set1_epi8(4);
4883        let r = _mm256_mpsadbw_epu8::<0>(a, b);
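        // Each 16-bit result is a sum of four absolute differences: 4 * |2 - 4| == 8.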
4884        let e = _mm256_set1_epi16(8);
4885        assert_eq_m256i(r, e);
4886    }
4887
4888    #[simd_test(enable = "avx2")]
4889    unsafe fn test_mm256_mul_epi32() {
4890        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4891        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4892        let r = _mm256_mul_epi32(a, b);
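        // Only the even-indexed 32-bit elements are multiplied: 0*1, 0*3, 2*5 and 2*7.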
4893        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4894        assert_eq_m256i(r, e);
4895    }
4896
4897    #[simd_test(enable = "avx2")]
4898    unsafe fn test_mm256_mul_epu32() {
4899        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4900        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4901        let r = _mm256_mul_epu32(a, b);
4902        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4903        assert_eq_m256i(r, e);
4904    }
4905
4906    #[simd_test(enable = "avx2")]
4907    unsafe fn test_mm256_mulhi_epi16() {
4908        let a = _mm256_set1_epi16(6535);
4909        let b = _mm256_set1_epi16(6535);
4910        let r = _mm256_mulhi_epi16(a, b);
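        // 6535 * 6535 == 42_706_225; its upper 16 bits are 651.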
4911        let e = _mm256_set1_epi16(651);
4912        assert_eq_m256i(r, e);
4913    }
4914
4915    #[simd_test(enable = "avx2")]
4916    unsafe fn test_mm256_mulhi_epu16() {
4917        let a = _mm256_set1_epi16(6535);
4918        let b = _mm256_set1_epi16(6535);
4919        let r = _mm256_mulhi_epu16(a, b);
4920        let e = _mm256_set1_epi16(651);
4921        assert_eq_m256i(r, e);
4922    }
4923
4924    #[simd_test(enable = "avx2")]
4925    unsafe fn test_mm256_mullo_epi16() {
4926        let a = _mm256_set1_epi16(2);
4927        let b = _mm256_set1_epi16(4);
4928        let r = _mm256_mullo_epi16(a, b);
4929        let e = _mm256_set1_epi16(8);
4930        assert_eq_m256i(r, e);
4931    }
4932
4933    #[simd_test(enable = "avx2")]
4934    unsafe fn test_mm256_mullo_epi32() {
4935        let a = _mm256_set1_epi32(2);
4936        let b = _mm256_set1_epi32(4);
4937        let r = _mm256_mullo_epi32(a, b);
4938        let e = _mm256_set1_epi32(8);
4939        assert_eq_m256i(r, e);
4940    }
4941
4942    #[simd_test(enable = "avx2")]
4943    unsafe fn test_mm256_mulhrs_epi16() {
        // Rounded high multiply: each lane is (((a * b) >> 14) + 1) >> 1.
        // With a == b == 0x4000 the result is 0x2000.
        let a = _mm256_set1_epi16(0x4000);
        let b = _mm256_set1_epi16(0x4000);
        let r = _mm256_mulhrs_epi16(a, b);
        let e = _mm256_set1_epi16(0x2000);
4948        assert_eq_m256i(r, e);
4949    }
4950
4951    #[simd_test(enable = "avx2")]
4952    unsafe fn test_mm256_or_si256() {
4953        let a = _mm256_set1_epi8(-1);
4954        let b = _mm256_set1_epi8(0);
4955        let r = _mm256_or_si256(a, b);
4956        assert_eq_m256i(r, a);
4957    }
4958
4959    #[simd_test(enable = "avx2")]
4960    unsafe fn test_mm256_packs_epi16() {
4961        let a = _mm256_set1_epi16(2);
4962        let b = _mm256_set1_epi16(4);
4963        let r = _mm256_packs_epi16(a, b);
4964        #[rustfmt::skip]
4965        let e = _mm256_setr_epi8(
4966            2, 2, 2, 2, 2, 2, 2, 2,
4967            4, 4, 4, 4, 4, 4, 4, 4,
4968            2, 2, 2, 2, 2, 2, 2, 2,
4969            4, 4, 4, 4, 4, 4, 4, 4,
4970        );
4971
4972        assert_eq_m256i(r, e);
4973    }
4974
4975    #[simd_test(enable = "avx2")]
4976    unsafe fn test_mm256_packs_epi32() {
4977        let a = _mm256_set1_epi32(2);
4978        let b = _mm256_set1_epi32(4);
4979        let r = _mm256_packs_epi32(a, b);
4980        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4981
4982        assert_eq_m256i(r, e);
4983    }
4984
4985    #[simd_test(enable = "avx2")]
4986    unsafe fn test_mm256_packus_epi16() {
4987        let a = _mm256_set1_epi16(2);
4988        let b = _mm256_set1_epi16(4);
4989        let r = _mm256_packus_epi16(a, b);
4990        #[rustfmt::skip]
4991        let e = _mm256_setr_epi8(
4992            2, 2, 2, 2, 2, 2, 2, 2,
4993            4, 4, 4, 4, 4, 4, 4, 4,
4994            2, 2, 2, 2, 2, 2, 2, 2,
4995            4, 4, 4, 4, 4, 4, 4, 4,
4996        );
4997
4998        assert_eq_m256i(r, e);
4999    }
5000
5001    #[simd_test(enable = "avx2")]
5002    unsafe fn test_mm256_packus_epi32() {
5003        let a = _mm256_set1_epi32(2);
5004        let b = _mm256_set1_epi32(4);
5005        let r = _mm256_packus_epi32(a, b);
5006        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5007
5008        assert_eq_m256i(r, e);
5009    }
5010
5011    #[simd_test(enable = "avx2")]
5012    unsafe fn test_mm256_sad_epu8() {
5013        let a = _mm256_set1_epi8(2);
5014        let b = _mm256_set1_epi8(4);
5015        let r = _mm256_sad_epu8(a, b);
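        // Each 64-bit lane sums eight absolute differences: 8 * |2 - 4| == 16.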
5016        let e = _mm256_set1_epi64x(16);
5017        assert_eq_m256i(r, e);
5018    }
5019
5020    #[simd_test(enable = "avx2")]
5021    unsafe fn test_mm256_shufflehi_epi16() {
5022        #[rustfmt::skip]
5023        let a = _mm256_setr_epi16(
5024            0, 1, 2, 3, 11, 22, 33, 44,
5025            4, 5, 6, 7, 55, 66, 77, 88,
5026        );
5027        #[rustfmt::skip]
5028        let e = _mm256_setr_epi16(
5029            0, 1, 2, 3, 44, 22, 22, 11,
5030            4, 5, 6, 7, 88, 66, 66, 55,
5031        );
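        // The 2-bit control fields (read from the LSB) select indices 3, 1, 1, 0 within the upper
        // quadword of each 128-bit lane.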
5032        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
5033        assert_eq_m256i(r, e);
5034    }
5035
5036    #[simd_test(enable = "avx2")]
5037    unsafe fn test_mm256_shufflelo_epi16() {
5038        #[rustfmt::skip]
5039        let a = _mm256_setr_epi16(
5040            11, 22, 33, 44, 0, 1, 2, 3,
5041            55, 66, 77, 88, 4, 5, 6, 7,
5042        );
5043        #[rustfmt::skip]
5044        let e = _mm256_setr_epi16(
5045            44, 22, 22, 11, 0, 1, 2, 3,
5046            88, 66, 66, 55, 4, 5, 6, 7,
5047        );
5048        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
5049        assert_eq_m256i(r, e);
5050    }
5051
5052    #[simd_test(enable = "avx2")]
5053    unsafe fn test_mm256_sign_epi16() {
5054        let a = _mm256_set1_epi16(2);
5055        let b = _mm256_set1_epi16(-1);
5056        let r = _mm256_sign_epi16(a, b);
5057        let e = _mm256_set1_epi16(-2);
5058        assert_eq_m256i(r, e);
5059    }
5060
5061    #[simd_test(enable = "avx2")]
5062    unsafe fn test_mm256_sign_epi32() {
5063        let a = _mm256_set1_epi32(2);
5064        let b = _mm256_set1_epi32(-1);
5065        let r = _mm256_sign_epi32(a, b);
5066        let e = _mm256_set1_epi32(-2);
5067        assert_eq_m256i(r, e);
5068    }
5069
5070    #[simd_test(enable = "avx2")]
5071    unsafe fn test_mm256_sign_epi8() {
5072        let a = _mm256_set1_epi8(2);
5073        let b = _mm256_set1_epi8(-1);
5074        let r = _mm256_sign_epi8(a, b);
5075        let e = _mm256_set1_epi8(-2);
5076        assert_eq_m256i(r, e);
5077    }
5078
5079    #[simd_test(enable = "avx2")]
5080    unsafe fn test_mm256_sll_epi16() {
5081        let a = _mm256_set1_epi16(0xFF);
5082        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5083        let r = _mm256_sll_epi16(a, b);
5084        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5085    }
5086
5087    #[simd_test(enable = "avx2")]
5088    unsafe fn test_mm256_sll_epi32() {
5089        let a = _mm256_set1_epi32(0xFFFF);
5090        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5091        let r = _mm256_sll_epi32(a, b);
5092        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5093    }
5094
5095    #[simd_test(enable = "avx2")]
5096    unsafe fn test_mm256_sll_epi64() {
5097        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5098        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
5099        let r = _mm256_sll_epi64(a, b);
5100        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5101    }
5102
5103    #[simd_test(enable = "avx2")]
5104    unsafe fn test_mm256_slli_epi16() {
5105        assert_eq_m256i(
5106            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5107            _mm256_set1_epi16(0xFF0),
5108        );
5109    }
5110
5111    #[simd_test(enable = "avx2")]
5112    unsafe fn test_mm256_slli_epi32() {
5113        assert_eq_m256i(
5114            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5115            _mm256_set1_epi32(0xFFFF0),
5116        );
5117    }
5118
5119    #[simd_test(enable = "avx2")]
5120    unsafe fn test_mm256_slli_epi64() {
5121        assert_eq_m256i(
5122            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5123            _mm256_set1_epi64x(0xFFFFFFFF0),
5124        );
5125    }
5126
5127    #[simd_test(enable = "avx2")]
5128    unsafe fn test_mm256_slli_si256() {
5129        let a = _mm256_set1_epi64x(0xFFFFFFFF);
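        // The byte shift is performed within each 128-bit lane independently.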
5130        let r = _mm256_slli_si256::<3>(a);
5131        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5132    }
5133
5134    #[simd_test(enable = "avx2")]
5135    unsafe fn test_mm_sllv_epi32() {
5136        let a = _mm_set1_epi32(2);
5137        let b = _mm_set1_epi32(1);
5138        let r = _mm_sllv_epi32(a, b);
5139        let e = _mm_set1_epi32(4);
5140        assert_eq_m128i(r, e);
5141    }
5142
5143    #[simd_test(enable = "avx2")]
5144    unsafe fn test_mm256_sllv_epi32() {
5145        let a = _mm256_set1_epi32(2);
5146        let b = _mm256_set1_epi32(1);
5147        let r = _mm256_sllv_epi32(a, b);
5148        let e = _mm256_set1_epi32(4);
5149        assert_eq_m256i(r, e);
5150    }
5151
5152    #[simd_test(enable = "avx2")]
5153    unsafe fn test_mm_sllv_epi64() {
5154        let a = _mm_set1_epi64x(2);
5155        let b = _mm_set1_epi64x(1);
5156        let r = _mm_sllv_epi64(a, b);
5157        let e = _mm_set1_epi64x(4);
5158        assert_eq_m128i(r, e);
5159    }
5160
5161    #[simd_test(enable = "avx2")]
5162    unsafe fn test_mm256_sllv_epi64() {
5163        let a = _mm256_set1_epi64x(2);
5164        let b = _mm256_set1_epi64x(1);
5165        let r = _mm256_sllv_epi64(a, b);
5166        let e = _mm256_set1_epi64x(4);
5167        assert_eq_m256i(r, e);
5168    }
5169
5170    #[simd_test(enable = "avx2")]
5171    unsafe fn test_mm256_sra_epi16() {
5172        let a = _mm256_set1_epi16(-1);
5173        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5174        let r = _mm256_sra_epi16(a, b);
5175        assert_eq_m256i(r, _mm256_set1_epi16(-1));
5176    }
5177
5178    #[simd_test(enable = "avx2")]
5179    unsafe fn test_mm256_sra_epi32() {
5180        let a = _mm256_set1_epi32(-1);
5181        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
5182        let r = _mm256_sra_epi32(a, b);
5183        assert_eq_m256i(r, _mm256_set1_epi32(-1));
5184    }
5185
5186    #[simd_test(enable = "avx2")]
5187    unsafe fn test_mm256_srai_epi16() {
5188        assert_eq_m256i(
5189            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
5190            _mm256_set1_epi16(-1),
5191        );
5192    }
5193
5194    #[simd_test(enable = "avx2")]
5195    unsafe fn test_mm256_srai_epi32() {
5196        assert_eq_m256i(
5197            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
5198            _mm256_set1_epi32(-1),
5199        );
5200    }
5201
5202    #[simd_test(enable = "avx2")]
5203    unsafe fn test_mm_srav_epi32() {
5204        let a = _mm_set1_epi32(4);
5205        let count = _mm_set1_epi32(1);
5206        let r = _mm_srav_epi32(a, count);
5207        let e = _mm_set1_epi32(2);
5208        assert_eq_m128i(r, e);
5209    }
5210
5211    #[simd_test(enable = "avx2")]
5212    unsafe fn test_mm256_srav_epi32() {
5213        let a = _mm256_set1_epi32(4);
5214        let count = _mm256_set1_epi32(1);
5215        let r = _mm256_srav_epi32(a, count);
5216        let e = _mm256_set1_epi32(2);
5217        assert_eq_m256i(r, e);
5218    }
5219
5220    #[simd_test(enable = "avx2")]
5221    unsafe fn test_mm256_srli_si256() {
5222        #[rustfmt::skip]
5223        let a = _mm256_setr_epi8(
5224            1, 2, 3, 4, 5, 6, 7, 8,
5225            9, 10, 11, 12, 13, 14, 15, 16,
5226            17, 18, 19, 20, 21, 22, 23, 24,
5227            25, 26, 27, 28, 29, 30, 31, 32,
5228        );
5229        let r = _mm256_srli_si256::<3>(a);
5230        #[rustfmt::skip]
5231        let e = _mm256_setr_epi8(
5232            4, 5, 6, 7, 8, 9, 10, 11,
5233            12, 13, 14, 15, 16, 0, 0, 0,
5234            20, 21, 22, 23, 24, 25, 26, 27,
5235            28, 29, 30, 31, 32, 0, 0, 0,
5236        );
5237        assert_eq_m256i(r, e);
5238    }
5239
5240    #[simd_test(enable = "avx2")]
5241    unsafe fn test_mm256_srl_epi16() {
5242        let a = _mm256_set1_epi16(0xFF);
5243        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5244        let r = _mm256_srl_epi16(a, b);
5245        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5246    }
5247
5248    #[simd_test(enable = "avx2")]
5249    unsafe fn test_mm256_srl_epi32() {
5250        let a = _mm256_set1_epi32(0xFFFF);
5251        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5252        let r = _mm256_srl_epi32(a, b);
5253        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5254    }
5255
5256    #[simd_test(enable = "avx2")]
5257    unsafe fn test_mm256_srl_epi64() {
5258        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5259        let b = _mm_setr_epi64x(4, 0);
5260        let r = _mm256_srl_epi64(a, b);
5261        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5262    }
5263
5264    #[simd_test(enable = "avx2")]
5265    unsafe fn test_mm256_srli_epi16() {
5266        assert_eq_m256i(
5267            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5268            _mm256_set1_epi16(0xF),
5269        );
5270    }
5271
5272    #[simd_test(enable = "avx2")]
5273    unsafe fn test_mm256_srli_epi32() {
5274        assert_eq_m256i(
5275            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5276            _mm256_set1_epi32(0xFFF),
5277        );
5278    }
5279
5280    #[simd_test(enable = "avx2")]
5281    unsafe fn test_mm256_srli_epi64() {
5282        assert_eq_m256i(
5283            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5284            _mm256_set1_epi64x(0xFFFFFFF),
5285        );
5286    }
5287
5288    #[simd_test(enable = "avx2")]
5289    unsafe fn test_mm_srlv_epi32() {
5290        let a = _mm_set1_epi32(2);
5291        let count = _mm_set1_epi32(1);
5292        let r = _mm_srlv_epi32(a, count);
5293        let e = _mm_set1_epi32(1);
5294        assert_eq_m128i(r, e);
5295    }
5296
5297    #[simd_test(enable = "avx2")]
5298    unsafe fn test_mm256_srlv_epi32() {
5299        let a = _mm256_set1_epi32(2);
5300        let count = _mm256_set1_epi32(1);
5301        let r = _mm256_srlv_epi32(a, count);
5302        let e = _mm256_set1_epi32(1);
5303        assert_eq_m256i(r, e);
5304    }
5305
5306    #[simd_test(enable = "avx2")]
5307    unsafe fn test_mm_srlv_epi64() {
5308        let a = _mm_set1_epi64x(2);
5309        let count = _mm_set1_epi64x(1);
5310        let r = _mm_srlv_epi64(a, count);
5311        let e = _mm_set1_epi64x(1);
5312        assert_eq_m128i(r, e);
5313    }
5314
5315    #[simd_test(enable = "avx2")]
5316    unsafe fn test_mm256_srlv_epi64() {
5317        let a = _mm256_set1_epi64x(2);
5318        let count = _mm256_set1_epi64x(1);
5319        let r = _mm256_srlv_epi64(a, count);
5320        let e = _mm256_set1_epi64x(1);
5321        assert_eq_m256i(r, e);
5322    }
5323
5324    #[simd_test(enable = "avx2")]
5325    unsafe fn test_mm256_stream_load_si256() {
5326        let a = _mm256_set_epi64x(5, 6, 7, 8);
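        // a is a __m256i, so its address satisfies the 32-byte alignment the non-temporal load requires.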
5327        let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
5328        assert_eq_m256i(a, r);
5329    }
5330
5331    #[simd_test(enable = "avx2")]
5332    unsafe fn test_mm256_sub_epi16() {
5333        let a = _mm256_set1_epi16(4);
5334        let b = _mm256_set1_epi16(2);
5335        let r = _mm256_sub_epi16(a, b);
5336        assert_eq_m256i(r, b);
5337    }
5338
5339    #[simd_test(enable = "avx2")]
5340    unsafe fn test_mm256_sub_epi32() {
5341        let a = _mm256_set1_epi32(4);
5342        let b = _mm256_set1_epi32(2);
5343        let r = _mm256_sub_epi32(a, b);
5344        assert_eq_m256i(r, b);
5345    }
5346
5347    #[simd_test(enable = "avx2")]
5348    unsafe fn test_mm256_sub_epi64() {
5349        let a = _mm256_set1_epi64x(4);
5350        let b = _mm256_set1_epi64x(2);
5351        let r = _mm256_sub_epi64(a, b);
5352        assert_eq_m256i(r, b);
5353    }
5354
5355    #[simd_test(enable = "avx2")]
5356    unsafe fn test_mm256_sub_epi8() {
5357        let a = _mm256_set1_epi8(4);
5358        let b = _mm256_set1_epi8(2);
5359        let r = _mm256_sub_epi8(a, b);
5360        assert_eq_m256i(r, b);
5361    }
5362
5363    #[simd_test(enable = "avx2")]
5364    unsafe fn test_mm256_subs_epi16() {
5365        let a = _mm256_set1_epi16(4);
5366        let b = _mm256_set1_epi16(2);
5367        let r = _mm256_subs_epi16(a, b);
5368        assert_eq_m256i(r, b);
5369    }
5370
5371    #[simd_test(enable = "avx2")]
5372    unsafe fn test_mm256_subs_epi8() {
5373        let a = _mm256_set1_epi8(4);
5374        let b = _mm256_set1_epi8(2);
5375        let r = _mm256_subs_epi8(a, b);
5376        assert_eq_m256i(r, b);
5377    }
5378
5379    #[simd_test(enable = "avx2")]
5380    unsafe fn test_mm256_subs_epu16() {
5381        let a = _mm256_set1_epi16(4);
5382        let b = _mm256_set1_epi16(2);
5383        let r = _mm256_subs_epu16(a, b);
5384        assert_eq_m256i(r, b);
5385    }
5386
5387    #[simd_test(enable = "avx2")]
5388    unsafe fn test_mm256_subs_epu8() {
5389        let a = _mm256_set1_epi8(4);
5390        let b = _mm256_set1_epi8(2);
5391        let r = _mm256_subs_epu8(a, b);
5392        assert_eq_m256i(r, b);
5393    }
5394
5395    #[simd_test(enable = "avx2")]
5396    unsafe fn test_mm256_xor_si256() {
5397        let a = _mm256_set1_epi8(5);
5398        let b = _mm256_set1_epi8(3);
5399        let r = _mm256_xor_si256(a, b);
5400        assert_eq_m256i(r, _mm256_set1_epi8(6));
5401    }
5402
5403    #[simd_test(enable = "avx2")]
5404    unsafe fn test_mm256_alignr_epi8() {
5405        #[rustfmt::skip]
5406        let a = _mm256_setr_epi8(
5407            1, 2, 3, 4, 5, 6, 7, 8,
5408            9, 10, 11, 12, 13, 14, 15, 16,
5409            17, 18, 19, 20, 21, 22, 23, 24,
5410            25, 26, 27, 28, 29, 30, 31, 32,
5411        );
5412        #[rustfmt::skip]
5413        let b = _mm256_setr_epi8(
5414            -1, -2, -3, -4, -5, -6, -7, -8,
5415            -9, -10, -11, -12, -13, -14, -15, -16,
5416            -17, -18, -19, -20, -21, -22, -23, -24,
5417            -25, -26, -27, -28, -29, -30, -31, -32,
5418        );
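        // alignr concatenates the matching 128-bit lanes of a and b and shifts the pair right by
        // IMM8 bytes; shifts of 32 bytes or more produce all zeros.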
5419        let r = _mm256_alignr_epi8::<33>(a, b);
5420        assert_eq_m256i(r, _mm256_set1_epi8(0));
5421
5422        let r = _mm256_alignr_epi8::<17>(a, b);
5423        #[rustfmt::skip]
5424        let expected = _mm256_setr_epi8(
5425            2, 3, 4, 5, 6, 7, 8, 9,
5426            10, 11, 12, 13, 14, 15, 16, 0,
5427            18, 19, 20, 21, 22, 23, 24, 25,
5428            26, 27, 28, 29, 30, 31, 32, 0,
5429        );
5430        assert_eq_m256i(r, expected);
5431
5432        let r = _mm256_alignr_epi8::<4>(a, b);
5433        #[rustfmt::skip]
5434        let expected = _mm256_setr_epi8(
5435            -5, -6, -7, -8, -9, -10, -11, -12,
5436            -13, -14, -15, -16, 1, 2, 3, 4,
5437            -21, -22, -23, -24, -25, -26, -27, -28,
5438            -29, -30, -31, -32, 17, 18, 19, 20,
5439        );
5440        assert_eq_m256i(r, expected);
5441
5442        let r = _mm256_alignr_epi8::<15>(a, b);
5443        #[rustfmt::skip]
5444        let expected = _mm256_setr_epi8(
5445            -16, 1, 2, 3, 4, 5, 6, 7,
5446            8, 9, 10, 11, 12, 13, 14, 15,
5447            -32, 17, 18, 19, 20, 21, 22, 23,
5448            24, 25, 26, 27, 28, 29, 30, 31,
5449        );
5450        assert_eq_m256i(r, expected);
5451
5452        let r = _mm256_alignr_epi8::<0>(a, b);
5453        assert_eq_m256i(r, b);
5454
5455        let r = _mm256_alignr_epi8::<16>(a, b);
5456        assert_eq_m256i(r, a);
5457    }
5458
5459    #[simd_test(enable = "avx2")]
5460    unsafe fn test_mm256_shuffle_epi8() {
5461        #[rustfmt::skip]
5462        let a = _mm256_setr_epi8(
5463            1, 2, 3, 4, 5, 6, 7, 8,
5464            9, 10, 11, 12, 13, 14, 15, 16,
5465            17, 18, 19, 20, 21, 22, 23, 24,
5466            25, 26, 27, 28, 29, 30, 31, 32,
5467        );
5468        #[rustfmt::skip]
5469        let b = _mm256_setr_epi8(
5470            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5471            12, 5, 5, 10, 4, 1, 8, 0,
5472            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5473            12, 5, 5, 10, 4, 1, 8, 0,
5474        );
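        // An index byte with its high bit set zeroes the destination byte; otherwise only its low
        // four bits are used, indexing within the same 128-bit lane.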
5475        #[rustfmt::skip]
5476        let expected = _mm256_setr_epi8(
5477            5, 0, 5, 4, 9, 13, 7, 4,
5478            13, 6, 6, 11, 5, 2, 9, 1,
5479            21, 0, 21, 20, 25, 29, 23, 20,
5480            29, 22, 22, 27, 21, 18, 25, 17,
5481        );
5482        let r = _mm256_shuffle_epi8(a, b);
5483        assert_eq_m256i(r, expected);
5484    }
5485
5486    #[simd_test(enable = "avx2")]
5487    unsafe fn test_mm256_permutevar8x32_epi32() {
5488        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5489        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5490        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5491        let r = _mm256_permutevar8x32_epi32(a, b);
5492        assert_eq_m256i(r, expected);
5493    }
5494
5495    #[simd_test(enable = "avx2")]
5496    unsafe fn test_mm256_permute4x64_epi64() {
5497        let a = _mm256_setr_epi64x(100, 200, 300, 400);
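        // Each 2-bit field of the control selects a source quadword: 0b00_01_00_11 picks 3, 0, 1, 0.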
5498        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5499        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5500        assert_eq_m256i(r, expected);
5501    }
5502
5503    #[simd_test(enable = "avx2")]
5504    unsafe fn test_mm256_permute2x128_si256() {
5505        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5506        let b = _mm256_setr_epi64x(300, 400, 700, 800);
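        // Control 0b00_01_00_11: the low 128 bits take source 3 (the high half of b), the high
        // 128 bits take source 1 (the high half of a).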
5507        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5508        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5509        assert_eq_m256i(r, e);
5510    }
5511
5512    #[simd_test(enable = "avx2")]
5513    unsafe fn test_mm256_permute4x64_pd() {
5514        let a = _mm256_setr_pd(1., 2., 3., 4.);
5515        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5516        let e = _mm256_setr_pd(4., 1., 2., 1.);
5517        assert_eq_m256d(r, e);
5518    }
5519
5520    #[simd_test(enable = "avx2")]
5521    unsafe fn test_mm256_permutevar8x32_ps() {
5522        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5523        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5524        let r = _mm256_permutevar8x32_ps(a, b);
5525        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5526        assert_eq_m256(r, e);
5527    }
5528
5529    #[simd_test(enable = "avx2")]
5530    unsafe fn test_mm_i32gather_epi32() {
5531        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5532        // A multiplier of 4 is word-addressing
5533        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5534        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5535    }
5536
5537    #[simd_test(enable = "avx2")]
5538    unsafe fn test_mm_mask_i32gather_epi32() {
5539        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5540        // A multiplier of 4 is word-addressing
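        // Lanes whose mask element has the sign bit clear keep the corresponding element of src (256).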
5541        let r = _mm_mask_i32gather_epi32::<4>(
5542            _mm_set1_epi32(256),
5543            arr.as_ptr(),
5544            _mm_setr_epi32(0, 16, 64, 96),
5545            _mm_setr_epi32(-1, -1, -1, 0),
5546        );
5547        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5548    }
5549
5550    #[simd_test(enable = "avx2")]
5551    unsafe fn test_mm256_i32gather_epi32() {
5552        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5553        // A multiplier of 4 is word-addressing
5554        let r =
5555            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5556        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5557    }
5558
5559    #[simd_test(enable = "avx2")]
5560    unsafe fn test_mm256_mask_i32gather_epi32() {
5561        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5562        // A multiplier of 4 is word-addressing
5563        let r = _mm256_mask_i32gather_epi32::<4>(
5564            _mm256_set1_epi32(256),
5565            arr.as_ptr(),
5566            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5567            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5568        );
5569        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5570    }
5571
5572    #[simd_test(enable = "avx2")]
5573    unsafe fn test_mm_i32gather_ps() {
5574        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5575        // A multiplier of 4 is word-addressing for f32s
5576        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5577        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5578    }
5579
5580    #[simd_test(enable = "avx2")]
5581    unsafe fn test_mm_mask_i32gather_ps() {
5582        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5583        // A multiplier of 4 is word-addressing for f32s
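        // For the ps/pd gathers the mask is taken from the float's sign bit, so -1.0 enables a lane
        // and 0.0 keeps the src value.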
5584        let r = _mm_mask_i32gather_ps::<4>(
5585            _mm_set1_ps(256.0),
5586            arr.as_ptr(),
5587            _mm_setr_epi32(0, 16, 64, 96),
5588            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5589        );
5590        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5591    }
5592
5593    #[simd_test(enable = "avx2")]
5594    unsafe fn test_mm256_i32gather_ps() {
5595        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5596        // A multiplier of 4 is word-addressing for f32s
5597        let r =
5598            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5599        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5600    }
5601
5602    #[simd_test(enable = "avx2")]
5603    unsafe fn test_mm256_mask_i32gather_ps() {
5604        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5605        // A multiplier of 4 is word-addressing for f32s
5606        let r = _mm256_mask_i32gather_ps::<4>(
5607            _mm256_set1_ps(256.0),
5608            arr.as_ptr(),
5609            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5610            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5611        );
5612        assert_eq_m256(
5613            r,
5614            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5615        );
5616    }
5617
5618    #[simd_test(enable = "avx2")]
5619    unsafe fn test_mm_i32gather_epi64() {
5620        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5621        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_mask_i32gather_epi64::<8>(
            _mm_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi32(16, 16, 16, 16),
            _mm_setr_epi64x(-1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_mask_i32gather_epi64::<8>(
            _mm256_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm256_setr_epi64x(-1, -1, -1, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_mask_i32gather_pd::<8>(
            _mm_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi32(16, 16, 16, 16),
            _mm_setr_pd(-1.0, 0.0),
        );
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i32gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_mask_i32gather_pd::<8>(
            _mm256_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi32(0, 16, 64, 96),
            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
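        // Two 64-bit indices yield two 32-bit results; the upper two lanes of the 128-bit
        // destination are zeroed.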
        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_epi32(-1, 0, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm256_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_epi32(-1, -1, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm256_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_mask_i64gather_epi64::<8>(
            _mm_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_epi64x(-1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_mask_i64gather_epi64::<8>(
            _mm256_set1_epi64x(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_epi64x(-1, -1, -1, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_mask_i64gather_pd::<8>(
            _mm_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_pd(-1.0, 0.0),
        );
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_mask_i64gather_pd::<8>(
            _mm256_set1_pd(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_extract_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31
        );
        let r1 = _mm256_extract_epi8::<0>(a);
        let r2 = _mm256_extract_epi8::<3>(a);
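        // The extracted byte is zero-extended to `i32`, so the -1 in lane 0 reads back as 0xFF.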
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_extract_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r1 = _mm256_extract_epi16::<0>(a);
        let r2 = _mm256_extract_epi16::<3>(a);
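        // Like the byte extract, the 16-bit value is zero-extended, so -1 reads back as 0xFFFF.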
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }
}