core/stdarch/crates/core_arch/src/x86/avx2.rs

//! Advanced Vector Extensions 2 (AVX2)
//!
//! AVX2 expands most AVX commands to 256-bit wide vector registers and
//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
//! overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

use core::hint::unreachable_unchecked;

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

/// Computes the absolute values of packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi32(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i32x8();
        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute values of packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi16(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i16x16();
        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute values of packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi8(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i8x32();
        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
        transmute(r)
    }
}
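
// A minimal usage sketch of the absolute-value intrinsics above, assuming a caller
// that already runs with AVX2 enabled. The helper name `_abs_epi32_example` and the
// constant inputs are illustrative only.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn _abs_epi32_example() -> __m256i {
    // abs([-1, 2, -3, 4, -5, 6, -7, 8]) == [1, 2, 3, 4, 5, 6, 7, 8]
    let a = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, 8);
    _mm256_abs_epi32(a)
}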

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
}

/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // If palignr is shifting the pair of vectors more than the size of two
        // lanes, emit zero.
        if IMM8 >= 32 {
            return _mm256_setzero_si256();
        }
        // If palignr is shifting the pair of input vectors more than one lane,
        // but less than two lanes, convert to shifting in zeroes.
        let (a, b) = if IMM8 > 16 {
            (_mm256_setzero_si256(), a)
        } else {
            (a, b)
        };

        let a = a.as_i8x32();
        let b = b.as_i8x32();

        if IMM8 == 16 {
            return transmute(a);
        }

        let r: i8x32 = match IMM8 % 16 {
            0 => simd_shuffle!(
                b,
                a,
                [
                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                ],
            ),
            1 => simd_shuffle!(
                b,
                a,
                [
                    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22,
                    23, 24, 25, 26, 27, 28, 29, 30, 31, 48,
                ],
            ),
            2 => simd_shuffle!(
                b,
                a,
                [
                    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23,
                    24, 25, 26, 27, 28, 29, 30, 31, 48, 49,
                ],
            ),
            3 => simd_shuffle!(
                b,
                a,
                [
                    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23,
                    24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
                ],
            ),
            4 => simd_shuffle!(
                b,
                a,
                [
                    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24,
                    25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
                ],
            ),
            5 => simd_shuffle!(
                b,
                a,
                [
                    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25,
                    26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
                ],
            ),
            6 => simd_shuffle!(
                b,
                a,
                [
                    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26,
                    27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
                ],
            ),
            7 => simd_shuffle!(
                b,
                a,
                [
                    7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26,
                    27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
                ],
            ),
            8 => simd_shuffle!(
                b,
                a,
                [
                    8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27,
                    28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
                ],
            ),
            9 => simd_shuffle!(
                b,
                a,
                [
                    9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28,
                    29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
                ],
            ),
            10 => simd_shuffle!(
                b,
                a,
                [
                    10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29,
                    30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
                ],
            ),
            11 => simd_shuffle!(
                b,
                a,
                [
                    11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30,
                    31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
                ],
            ),
            12 => simd_shuffle!(
                b,
                a,
                [
                    12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31,
                    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
                ],
            ),
            13 => simd_shuffle!(
                b,
                a,
                [
                    13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48,
                    49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
                ],
            ),
            14 => simd_shuffle!(
                b,
                a,
                [
                    14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49,
                    50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
                ],
            ),
            15 => simd_shuffle!(
                b,
                a,
                [
                    15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50,
                    51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
                ],
            ),
            _ => unreachable_unchecked(),
        };
        transmute(r)
    }
}
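
// A minimal sketch of how `_mm256_alignr_epi8` behaves: it concatenates `b` and `a`
// per 128-bit lane and shifts right by `IMM8` bytes within each lane. The helper
// name `_alignr_by_one_example` is illustrative only.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn _alignr_by_one_example(a: __m256i, b: __m256i) -> __m256i {
    // Each 16-byte half of the result is bytes 1..=16 of [b_lane, a_lane].
    _mm256_alignr_epi8::<1>(a, b)
}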

/// Computes the bitwise AND of 256 bits (representing integer data)
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
}

/// Computes the bitwise NOT of 256 bits (representing integer data)
/// in `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let all_ones = _mm256_set1_epi8(-1);
        transmute(simd_and(
            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
            b.as_i64x4(),
        ))
    }
}

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
        transmute(simd_cast::<_, u16x16>(r))
    }
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u16x32>(a.as_u8x32());
        let b = simd_cast::<_, u16x32>(b.as_u8x32());
        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
        transmute(simd_cast::<_, u8x32>(r))
    }
}
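
// A scalar model of one lane of the averaging intrinsics above: the sum is widened
// before adding 1 and shifting right, so the carry bit is preserved. The helper name
// `_avg_epu8_scalar_model` is illustrative only.
#[cfg(test)]
#[allow(dead_code)]
fn _avg_epu8_scalar_model(a: u8, b: u8) -> u8 {
    // e.g. (200 + 255 + 1) >> 1 == 228, with no wrap-around.
    ((a as u16 + b as u16 + 1) >> 1) as u8
}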

/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        let r: i32x4 = simd_shuffle!(
            a,
            b,
            [
                [0, 4, 0, 4][IMM4 as usize & 0b11],
                [1, 1, 5, 5][IMM4 as usize & 0b11],
                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        let r: i32x8 = simd_shuffle!(
            a,
            b,
            [
                [0, 8, 0, 8][IMM8 as usize & 0b11],
                [1, 1, 9, 9][IMM8 as usize & 0b11],
                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}
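
// A minimal sketch of the blend control mask: bit `i` of `IMM8` selects element `i`
// from `b` when set and from `a` when clear. The helper name
// `_blend_low_half_example` is illustrative only.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn _blend_low_half_example(a: __m256i, b: __m256i) -> __m256i {
    // 0b0000_1111: elements 0..=3 come from `b`, elements 4..=7 from `a`.
    _mm256_blend_epi32::<0b0000_1111>(a, b)
}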

/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();

        let r: i16x16 = simd_shuffle!(
            a,
            b,
            [
                [0, 16, 0, 16][IMM8 as usize & 0b11],
                [1, 1, 17, 17][IMM8 as usize & 0b11],
                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
                [8, 24, 8, 24][IMM8 as usize & 0b11],
                [9, 9, 25, 25][IMM8 as usize & 0b11],
                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 8-bit integers from `a` and `b` using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
    unsafe {
        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
    }
}

/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
        transmute::<i8x16, _>(ret)
    }
}

/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
        transmute::<i8x32, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
        transmute::<i32x4, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
        transmute::<i32x8, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
// Emits `vmovddup` instead of `vpbroadcastq`
// See https://github.com/rust-lang/stdarch/issues/791
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
        transmute::<i64x2, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
}

/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
        transmute::<i16x8, _>(ret)
    }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
        transmute::<i16x16, _>(ret)
    }
}

/// Compares packed 64-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
}

/// Compares packed 64-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
}

/// Sign-extend 16-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
}

/// Sign-extend 16-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i16x8();
        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Sign-extend 32-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
}

/// Sign-extend 8-bit integers to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
}

/// Sign-extend 8-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Sign-extend 8-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}

/// Zero-extend unsigned 16-bit integers in `a` to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
}

/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
/// integers. The upper four elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u16x8();
        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
}

/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
}

/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
/// integers. The upper eight elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
/// integers. The upper twelve elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}

/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        let b = i64x4::ZERO;
        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
        transmute(dst)
    }
}
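
// A minimal sketch of `_mm256_extracti128_si256`: `IMM1 = 0` selects the low 128 bits
// and `IMM1 = 1` the high 128 bits. The helper name `_extract_high_example` is
// illustrative only.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn _extract_high_example(a: __m256i) -> __m128i {
    // Picks elements [2, 3] of the underlying i64x4, i.e. the upper half.
    _mm256_extracti128_si256::<1>(a)
}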

/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
}

/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
}

/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
}

/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
}

/// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
}

/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
}

/// Returns values from `slice` at offsets determined by `offsets * scale`, where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
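
// A minimal sketch of a gather, assuming in-bounds indices: `SCALE` is a byte
// multiplier, so stepping through whole `i32` elements uses a scale of 4. The helper
// name `_gather_every_other_example` is illustrative only.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn _gather_every_other_example(data: &[i32; 8]) -> __m128i {
    // Loads data[0], data[2], data[4] and data[6] in a single gather.
    let offsets = _mm_setr_epi32(0, 2, 4, 6);
    _mm_i32gather_epi32::<4>(data.as_ptr(), offsets)
}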
1066
1067/// Returns values from `slice` at offsets determined by `offsets * scale`,
1068/// where
1069/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1070/// that position instead.
1071///
1072/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
1073#[inline]
1074#[target_feature(enable = "avx2")]
1075#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1076#[rustc_legacy_const_generics(4)]
1077#[stable(feature = "simd_x86", since = "1.27.0")]
1078pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
1079    src: __m128i,
1080    slice: *const i32,
1081    offsets: __m128i,
1082    mask: __m128i,
1083) -> __m128i {
1084    static_assert_imm8_scale!(SCALE);
1085    let src = src.as_i32x4();
1086    let mask = mask.as_i32x4();
1087    let offsets = offsets.as_i32x4();
1088    let slice = slice as *const i8;
1089    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
1090    transmute(r)
1091}
1092
1093/// Returns values from `slice` at offsets determined by `offsets * scale`,
1094/// where
1095/// `scale` should be 1, 2, 4 or 8.
1096///
1097/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
1098#[inline]
1099#[target_feature(enable = "avx2")]
1100#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1101#[rustc_legacy_const_generics(2)]
1102#[stable(feature = "simd_x86", since = "1.27.0")]
1103pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
1104    slice: *const i32,
1105    offsets: __m256i,
1106) -> __m256i {
1107    static_assert_imm8_scale!(SCALE);
1108    let zero = i32x8::ZERO;
1109    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1110    let offsets = offsets.as_i32x8();
1111    let slice = slice as *const i8;
1112    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1113    transmute(r)
1114}
1115
1116/// Returns values from `slice` at offsets determined by `offsets * scale`,
1117/// where
1118/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1119/// that position instead.
1120///
1121/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
1122#[inline]
1123#[target_feature(enable = "avx2")]
1124#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1125#[rustc_legacy_const_generics(4)]
1126#[stable(feature = "simd_x86", since = "1.27.0")]
1127pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
1128    src: __m256i,
1129    slice: *const i32,
1130    offsets: __m256i,
1131    mask: __m256i,
1132) -> __m256i {
1133    static_assert_imm8_scale!(SCALE);
1134    let src = src.as_i32x8();
1135    let mask = mask.as_i32x8();
1136    let offsets = offsets.as_i32x8();
1137    let slice = slice as *const i8;
1138    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
1139    transmute(r)
1140}
1141
1142/// Returns values from `slice` at offsets determined by `offsets * scale`,
1143/// where
1144/// `scale` should be 1, 2, 4 or 8.
1145///
1146/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
1147#[inline]
1148#[target_feature(enable = "avx2")]
1149#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1150#[rustc_legacy_const_generics(2)]
1151#[stable(feature = "simd_x86", since = "1.27.0")]
1152pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1153    static_assert_imm8_scale!(SCALE);
1154    let zero = _mm_setzero_ps();
1155    let neg_one = _mm_set1_ps(-1.0);
1156    let offsets = offsets.as_i32x4();
1157    let slice = slice as *const i8;
1158    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1159}
1160
1161/// Returns values from `slice` at offsets determined by `offsets * scale`,
1162/// where
1163/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1164/// that position instead.
1165///
1166/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
1167#[inline]
1168#[target_feature(enable = "avx2")]
1169#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1170#[rustc_legacy_const_generics(4)]
1171#[stable(feature = "simd_x86", since = "1.27.0")]
1172pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
1173    src: __m128,
1174    slice: *const f32,
1175    offsets: __m128i,
1176    mask: __m128,
1177) -> __m128 {
1178    static_assert_imm8_scale!(SCALE);
1179    let offsets = offsets.as_i32x4();
1180    let slice = slice as *const i8;
1181    pgatherdps(src, slice, offsets, mask, SCALE as i8)
1182}
1183
1184/// Returns values from `slice` at offsets determined by `offsets * scale`,
1185/// where
1186/// `scale` should be 1, 2, 4 or 8.
1187///
1188/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
1189#[inline]
1190#[target_feature(enable = "avx2")]
1191#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1192#[rustc_legacy_const_generics(2)]
1193#[stable(feature = "simd_x86", since = "1.27.0")]
1194pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
1195    static_assert_imm8_scale!(SCALE);
1196    let zero = _mm256_setzero_ps();
1197    let neg_one = _mm256_set1_ps(-1.0);
1198    let offsets = offsets.as_i32x8();
1199    let slice = slice as *const i8;
1200    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1201}
1202
1203/// Returns values from `slice` at offsets determined by `offsets * scale`,
1204/// where
1205/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1206/// that position instead.
1207///
1208/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
1209#[inline]
1210#[target_feature(enable = "avx2")]
1211#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1212#[rustc_legacy_const_generics(4)]
1213#[stable(feature = "simd_x86", since = "1.27.0")]
1214pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
1215    src: __m256,
1216    slice: *const f32,
1217    offsets: __m256i,
1218    mask: __m256,
1219) -> __m256 {
1220    static_assert_imm8_scale!(SCALE);
1221    let offsets = offsets.as_i32x8();
1222    let slice = slice as *const i8;
1223    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
1224}
1225
1226/// Returns values from `slice` at offsets determined by `offsets * scale`,
1227/// where
1228/// `scale` should be 1, 2, 4 or 8.
1229///
1230/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
1231#[inline]
1232#[target_feature(enable = "avx2")]
1233#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1234#[rustc_legacy_const_generics(2)]
1235#[stable(feature = "simd_x86", since = "1.27.0")]
1236pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
1237    slice: *const i64,
1238    offsets: __m128i,
1239) -> __m128i {
1240    static_assert_imm8_scale!(SCALE);
1241    let zero = i64x2::ZERO;
1242    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1243    let offsets = offsets.as_i32x4();
1244    let slice = slice as *const i8;
1245    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1246    transmute(r)
1247}
1248
1249/// Returns values from `slice` at offsets determined by `offsets * scale`,
1250/// where
1251/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1252/// that position instead.
1253///
1254/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
1255#[inline]
1256#[target_feature(enable = "avx2")]
1257#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1258#[rustc_legacy_const_generics(4)]
1259#[stable(feature = "simd_x86", since = "1.27.0")]
1260pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
1261    src: __m128i,
1262    slice: *const i64,
1263    offsets: __m128i,
1264    mask: __m128i,
1265) -> __m128i {
1266    static_assert_imm8_scale!(SCALE);
1267    let src = src.as_i64x2();
1268    let mask = mask.as_i64x2();
1269    let offsets = offsets.as_i32x4();
1270    let slice = slice as *const i8;
1271    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
1272    transmute(r)
1273}
1274
1275/// Returns values from `slice` at offsets determined by `offsets * scale`,
1276/// where
1277/// `scale` should be 1, 2, 4 or 8.
1278///
1279/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
1280#[inline]
1281#[target_feature(enable = "avx2")]
1282#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1283#[rustc_legacy_const_generics(2)]
1284#[stable(feature = "simd_x86", since = "1.27.0")]
1285pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
1286    slice: *const i64,
1287    offsets: __m128i,
1288) -> __m256i {
1289    static_assert_imm8_scale!(SCALE);
1290    let zero = i64x4::ZERO;
1291    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1292    let offsets = offsets.as_i32x4();
1293    let slice = slice as *const i8;
1294    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1295    transmute(r)
1296}
1297
1298/// Returns values from `slice` at offsets determined by `offsets * scale`,
1299/// where
1300/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1301/// that position instead.
1302///
1303/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
1304#[inline]
1305#[target_feature(enable = "avx2")]
1306#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1307#[rustc_legacy_const_generics(4)]
1308#[stable(feature = "simd_x86", since = "1.27.0")]
1309pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
1310    src: __m256i,
1311    slice: *const i64,
1312    offsets: __m128i,
1313    mask: __m256i,
1314) -> __m256i {
1315    static_assert_imm8_scale!(SCALE);
1316    let src = src.as_i64x4();
1317    let mask = mask.as_i64x4();
1318    let offsets = offsets.as_i32x4();
1319    let slice = slice as *const i8;
1320    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
1321    transmute(r)
1322}
1323
1324/// Returns values from `slice` at offsets determined by `offsets * scale`,
1325/// where
1326/// `scale` should be 1, 2, 4 or 8.
1327///
1328/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
1329#[inline]
1330#[target_feature(enable = "avx2")]
1331#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1332#[rustc_legacy_const_generics(2)]
1333#[stable(feature = "simd_x86", since = "1.27.0")]
1334pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1335    static_assert_imm8_scale!(SCALE);
1336    let zero = _mm_setzero_pd();
1337    let neg_one = _mm_set1_pd(-1.0);
1338    let offsets = offsets.as_i32x4();
1339    let slice = slice as *const i8;
1340    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1341}
1342
1343/// Returns values from `slice` at offsets determined by `offsets * scale`,
1344/// where
1345/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
1346/// that position instead.
1347///
1348/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
1349#[inline]
1350#[target_feature(enable = "avx2")]
1351#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1352#[rustc_legacy_const_generics(4)]
1353#[stable(feature = "simd_x86", since = "1.27.0")]
1354pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
1355    src: __m128d,
1356    slice: *const f64,
1357    offsets: __m128i,
1358    mask: __m128d,
1359) -> __m128d {
1360    static_assert_imm8_scale!(SCALE);
1361    let offsets = offsets.as_i32x4();
1362    let slice = slice as *const i8;
1363    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
1364}
1365
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1369///
1370/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
1371#[inline]
1372#[target_feature(enable = "avx2")]
1373#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1374#[rustc_legacy_const_generics(2)]
1375#[stable(feature = "simd_x86", since = "1.27.0")]
1376pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
1377    slice: *const f64,
1378    offsets: __m128i,
1379) -> __m256d {
1380    static_assert_imm8_scale!(SCALE);
1381    let zero = _mm256_setzero_pd();
1382    let neg_one = _mm256_set1_pd(-1.0);
1383    let offsets = offsets.as_i32x4();
1384    let slice = slice as *const i8;
1385    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1386}
1387
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
/// corresponding element in `mask` is not set, the value from `src` is used
/// for that position instead.
1392///
1393/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
1394#[inline]
1395#[target_feature(enable = "avx2")]
1396#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1397#[rustc_legacy_const_generics(4)]
1398#[stable(feature = "simd_x86", since = "1.27.0")]
1399pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
1400    src: __m256d,
1401    slice: *const f64,
1402    offsets: __m128i,
1403    mask: __m256d,
1404) -> __m256d {
1405    static_assert_imm8_scale!(SCALE);
1406    let offsets = offsets.as_i32x4();
1407    let slice = slice as *const i8;
1408    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
1409}
1410
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1414///
1415/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
1416#[inline]
1417#[target_feature(enable = "avx2")]
1418#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1419#[rustc_legacy_const_generics(2)]
1420#[stable(feature = "simd_x86", since = "1.27.0")]
1421pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
1422    slice: *const i32,
1423    offsets: __m128i,
1424) -> __m128i {
1425    static_assert_imm8_scale!(SCALE);
1426    let zero = i32x4::ZERO;
1427    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1428    let offsets = offsets.as_i64x2();
1429    let slice = slice as *const i8;
1430    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1431    transmute(r)
1432}
1433
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
/// corresponding element in `mask` is not set, the value from `src` is used
/// for that position instead.
1438///
1439/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
1440#[inline]
1441#[target_feature(enable = "avx2")]
1442#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1443#[rustc_legacy_const_generics(4)]
1444#[stable(feature = "simd_x86", since = "1.27.0")]
1445pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
1446    src: __m128i,
1447    slice: *const i32,
1448    offsets: __m128i,
1449    mask: __m128i,
1450) -> __m128i {
1451    static_assert_imm8_scale!(SCALE);
1452    let src = src.as_i32x4();
1453    let mask = mask.as_i32x4();
1454    let offsets = offsets.as_i64x2();
1455    let slice = slice as *const i8;
1456    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
1457    transmute(r)
1458}
1459
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1463///
1464/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
1465#[inline]
1466#[target_feature(enable = "avx2")]
1467#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1468#[rustc_legacy_const_generics(2)]
1469#[stable(feature = "simd_x86", since = "1.27.0")]
1470pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
1471    slice: *const i32,
1472    offsets: __m256i,
1473) -> __m128i {
1474    static_assert_imm8_scale!(SCALE);
1475    let zero = i32x4::ZERO;
1476    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1477    let offsets = offsets.as_i64x4();
1478    let slice = slice as *const i8;
1479    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1480    transmute(r)
1481}
1482
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
/// corresponding element in `mask` is not set, the value from `src` is used
/// for that position instead.
1487///
1488/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
1489#[inline]
1490#[target_feature(enable = "avx2")]
1491#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1492#[rustc_legacy_const_generics(4)]
1493#[stable(feature = "simd_x86", since = "1.27.0")]
1494pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
1495    src: __m128i,
1496    slice: *const i32,
1497    offsets: __m256i,
1498    mask: __m128i,
1499) -> __m128i {
1500    static_assert_imm8_scale!(SCALE);
1501    let src = src.as_i32x4();
1502    let mask = mask.as_i32x4();
1503    let offsets = offsets.as_i64x4();
1504    let slice = slice as *const i8;
1505    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
1506    transmute(r)
1507}
1508
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1512///
1513/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
1514#[inline]
1515#[target_feature(enable = "avx2")]
1516#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1517#[rustc_legacy_const_generics(2)]
1518#[stable(feature = "simd_x86", since = "1.27.0")]
1519pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1520    static_assert_imm8_scale!(SCALE);
1521    let zero = _mm_setzero_ps();
1522    let neg_one = _mm_set1_ps(-1.0);
1523    let offsets = offsets.as_i64x2();
1524    let slice = slice as *const i8;
1525    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1526}
1527
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
/// corresponding element in `mask` is not set, the value from `src` is used
/// for that position instead.
1532///
1533/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
1534#[inline]
1535#[target_feature(enable = "avx2")]
1536#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1537#[rustc_legacy_const_generics(4)]
1538#[stable(feature = "simd_x86", since = "1.27.0")]
1539pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
1540    src: __m128,
1541    slice: *const f32,
1542    offsets: __m128i,
1543    mask: __m128,
1544) -> __m128 {
1545    static_assert_imm8_scale!(SCALE);
1546    let offsets = offsets.as_i64x2();
1547    let slice = slice as *const i8;
1548    pgatherqps(src, slice, offsets, mask, SCALE as i8)
1549}
1550
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1554///
1555/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
1556#[inline]
1557#[target_feature(enable = "avx2")]
1558#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1559#[rustc_legacy_const_generics(2)]
1560#[stable(feature = "simd_x86", since = "1.27.0")]
1561pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
1562    static_assert_imm8_scale!(SCALE);
1563    let zero = _mm_setzero_ps();
1564    let neg_one = _mm_set1_ps(-1.0);
1565    let offsets = offsets.as_i64x4();
1566    let slice = slice as *const i8;
1567    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1568}
1569
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
/// corresponding element in `mask` is not set, the value from `src` is used
/// for that position instead.
1574///
1575/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
1576#[inline]
1577#[target_feature(enable = "avx2")]
1578#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1579#[rustc_legacy_const_generics(4)]
1580#[stable(feature = "simd_x86", since = "1.27.0")]
1581pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
1582    src: __m128,
1583    slice: *const f32,
1584    offsets: __m256i,
1585    mask: __m128,
1586) -> __m128 {
1587    static_assert_imm8_scale!(SCALE);
1588    let offsets = offsets.as_i64x4();
1589    let slice = slice as *const i8;
1590    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
1591}
1592
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1596///
1597/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
1598#[inline]
1599#[target_feature(enable = "avx2")]
1600#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1601#[rustc_legacy_const_generics(2)]
1602#[stable(feature = "simd_x86", since = "1.27.0")]
1603pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
1604    slice: *const i64,
1605    offsets: __m128i,
1606) -> __m128i {
1607    static_assert_imm8_scale!(SCALE);
1608    let zero = i64x2::ZERO;
1609    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1610    let slice = slice as *const i8;
1611    let offsets = offsets.as_i64x2();
1612    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1613    transmute(r)
1614}
1615
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
/// corresponding element in `mask` is not set, the value from `src` is used
/// for that position instead.
1620///
1621/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
1622#[inline]
1623#[target_feature(enable = "avx2")]
1624#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1625#[rustc_legacy_const_generics(4)]
1626#[stable(feature = "simd_x86", since = "1.27.0")]
1627pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
1628    src: __m128i,
1629    slice: *const i64,
1630    offsets: __m128i,
1631    mask: __m128i,
1632) -> __m128i {
1633    static_assert_imm8_scale!(SCALE);
1634    let src = src.as_i64x2();
1635    let mask = mask.as_i64x2();
1636    let offsets = offsets.as_i64x2();
1637    let slice = slice as *const i8;
1638    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
1639    transmute(r)
1640}
1641
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1645///
1646/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
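///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: the offsets are 64-bit here, and `SCALE = 8` turns them into
/// `i64` element indices.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data: [i64; 8] = [0, 10, 20, 30, 40, 50, 60, 70];
/// let offsets = _mm256_setr_epi64x(1, 3, 5, 7);
/// let r = _mm256_i64gather_epi64::<8>(data.as_ptr(), offsets);
/// let mut out = [0i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [10, 30, 50, 70]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```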
1647#[inline]
1648#[target_feature(enable = "avx2")]
1649#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1650#[rustc_legacy_const_generics(2)]
1651#[stable(feature = "simd_x86", since = "1.27.0")]
1652pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
1653    slice: *const i64,
1654    offsets: __m256i,
1655) -> __m256i {
1656    static_assert_imm8_scale!(SCALE);
1657    let zero = i64x4::ZERO;
1658    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1659    let slice = slice as *const i8;
1660    let offsets = offsets.as_i64x4();
1661    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1662    transmute(r)
1663}
1664
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
/// corresponding element in `mask` is not set, the value from `src` is used
/// for that position instead.
1669///
1670/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
1671#[inline]
1672#[target_feature(enable = "avx2")]
1673#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1674#[rustc_legacy_const_generics(4)]
1675#[stable(feature = "simd_x86", since = "1.27.0")]
1676pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
1677    src: __m256i,
1678    slice: *const i64,
1679    offsets: __m256i,
1680    mask: __m256i,
1681) -> __m256i {
1682    static_assert_imm8_scale!(SCALE);
1683    let src = src.as_i64x4();
1684    let mask = mask.as_i64x4();
1685    let offsets = offsets.as_i64x4();
1686    let slice = slice as *const i8;
1687    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
1688    transmute(r)
1689}
1690
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1694///
1695/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
1696#[inline]
1697#[target_feature(enable = "avx2")]
1698#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1699#[rustc_legacy_const_generics(2)]
1700#[stable(feature = "simd_x86", since = "1.27.0")]
1701pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1702    static_assert_imm8_scale!(SCALE);
1703    let zero = _mm_setzero_pd();
1704    let neg_one = _mm_set1_pd(-1.0);
1705    let slice = slice as *const i8;
1706    let offsets = offsets.as_i64x2();
1707    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1708}
1709
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
/// corresponding element in `mask` is not set, the value from `src` is used
/// for that position instead.
1714///
1715/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
1716#[inline]
1717#[target_feature(enable = "avx2")]
1718#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1719#[rustc_legacy_const_generics(4)]
1720#[stable(feature = "simd_x86", since = "1.27.0")]
1721pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
1722    src: __m128d,
1723    slice: *const f64,
1724    offsets: __m128i,
1725    mask: __m128d,
1726) -> __m128d {
1727    static_assert_imm8_scale!(SCALE);
1728    let slice = slice as *const i8;
1729    let offsets = offsets.as_i64x2();
1730    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
1731}
1732
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1736///
1737/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
1738#[inline]
1739#[target_feature(enable = "avx2")]
1740#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1741#[rustc_legacy_const_generics(2)]
1742#[stable(feature = "simd_x86", since = "1.27.0")]
1743pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
1744    slice: *const f64,
1745    offsets: __m256i,
1746) -> __m256d {
1747    static_assert_imm8_scale!(SCALE);
1748    let zero = _mm256_setzero_pd();
1749    let neg_one = _mm256_set1_pd(-1.0);
1750    let slice = slice as *const i8;
1751    let offsets = offsets.as_i64x4();
1752    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1753}
1754
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
/// corresponding element in `mask` is not set, the value from `src` is used
/// for that position instead.
1759///
1760/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
1761#[inline]
1762#[target_feature(enable = "avx2")]
1763#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1764#[rustc_legacy_const_generics(4)]
1765#[stable(feature = "simd_x86", since = "1.27.0")]
1766pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
1767    src: __m256d,
1768    slice: *const f64,
1769    offsets: __m256i,
1770    mask: __m256d,
1771) -> __m256d {
1772    static_assert_imm8_scale!(SCALE);
1773    let slice = slice as *const i8;
1774    let offsets = offsets.as_i64x4();
1775    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
1776}
1777
/// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
/// location specified by `IMM1`.
1780///
1781/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
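///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: `IMM1 = 1` replaces the upper 128-bit half of `a` with `b`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// let b = _mm_set_epi64x(8, 7); // element 0 = 7, element 1 = 8
/// let r = _mm256_inserti128_si256::<1>(a, b);
/// let mut out = [0i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [0, 1, 7, 8]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```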
1782#[inline]
1783#[target_feature(enable = "avx2")]
1784#[cfg_attr(
1785    all(test, not(target_env = "msvc")),
1786    assert_instr(vinsertf128, IMM1 = 1)
1787)]
1788#[rustc_legacy_const_generics(2)]
1789#[stable(feature = "simd_x86", since = "1.27.0")]
1790pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1791    static_assert_uimm_bits!(IMM1, 1);
1792    unsafe {
1793        let a = a.as_i64x4();
1794        let b = _mm256_castsi128_si256(b).as_i64x4();
1795        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
1796        transmute(dst)
1797    }
1798}
1799
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers, then horizontally adds adjacent pairs
/// of those intermediate 32-bit integers and returns the results.
1803///
1804/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
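///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: every pair of adjacent 16-bit products collapses into one 32-bit
/// sum.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(2);
/// let b = _mm256_set1_epi16(3);
/// // Each 32-bit lane holds 2 * 3 + 2 * 3 = 12.
/// let r = _mm256_madd_epi16(a, b);
/// let mut out = [0i32; 8];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [12; 8]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```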
1805#[inline]
1806#[target_feature(enable = "avx2")]
1807#[cfg_attr(test, assert_instr(vpmaddwd))]
1808#[stable(feature = "simd_x86", since = "1.27.0")]
1809pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1810    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
1811}
1812
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers, then horizontally adds adjacent pairs of those
/// intermediate 16-bit integers with signed saturation and returns the results.
1817///
1818/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
1819#[inline]
1820#[target_feature(enable = "avx2")]
1821#[cfg_attr(test, assert_instr(vpmaddubsw))]
1822#[stable(feature = "simd_x86", since = "1.27.0")]
1823pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1824    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) }
1825}
1826
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1828/// (elements are zeroed out when the highest bit is not set in the
1829/// corresponding element).
1830///
1831/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
1832#[inline]
1833#[target_feature(enable = "avx2")]
1834#[cfg_attr(test, assert_instr(vpmaskmovd))]
1835#[stable(feature = "simd_x86", since = "1.27.0")]
1836pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1837    transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1838}
1839
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1841/// (elements are zeroed out when the highest bit is not set in the
1842/// corresponding element).
1843///
1844/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
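///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: elements whose mask has the highest bit clear are loaded as zero.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
/// // Load only the even-indexed elements.
/// let mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
/// let r = _mm256_maskload_epi32(data.as_ptr(), mask);
/// let mut out = [0i32; 8];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [1, 0, 3, 0, 5, 0, 7, 0]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```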
1845#[inline]
1846#[target_feature(enable = "avx2")]
1847#[cfg_attr(test, assert_instr(vpmaskmovd))]
1848#[stable(feature = "simd_x86", since = "1.27.0")]
1849pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
1850    transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
1851}
1852
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1854/// (elements are zeroed out when the highest bit is not set in the
1855/// corresponding element).
1856///
1857/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
1858#[inline]
1859#[target_feature(enable = "avx2")]
1860#[cfg_attr(test, assert_instr(vpmaskmovq))]
1861#[stable(feature = "simd_x86", since = "1.27.0")]
1862pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
1863    transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
1864}
1865
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1867/// (elements are zeroed out when the highest bit is not set in the
1868/// corresponding element).
1869///
1870/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
1871#[inline]
1872#[target_feature(enable = "avx2")]
1873#[cfg_attr(test, assert_instr(vpmaskmovq))]
1874#[stable(feature = "simd_x86", since = "1.27.0")]
1875pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
1876    transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
1877}
1878
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1880/// using `mask` (elements are not stored when the highest bit is not set
1881/// in the corresponding element).
1882///
1883/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
1884#[inline]
1885#[target_feature(enable = "avx2")]
1886#[cfg_attr(test, assert_instr(vpmaskmovd))]
1887#[stable(feature = "simd_x86", since = "1.27.0")]
1888pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
1889    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
1890}
1891
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1893/// using `mask` (elements are not stored when the highest bit is not set
1894/// in the corresponding element).
1895///
1896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
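///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: only the lanes whose mask element has the highest bit set are
/// written back to memory.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
/// let mut out = [0i32; 8];
/// // Store only the first four elements; the rest of `out` is left untouched.
/// let mask = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
/// _mm256_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 2, 3, 4, 0, 0, 0, 0]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```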
1897#[inline]
1898#[target_feature(enable = "avx2")]
1899#[cfg_attr(test, assert_instr(vpmaskmovd))]
1900#[stable(feature = "simd_x86", since = "1.27.0")]
1901pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
1902    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
1903}
1904
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1906/// using `mask` (elements are not stored when the highest bit is not set
1907/// in the corresponding element).
1908///
1909/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
1910#[inline]
1911#[target_feature(enable = "avx2")]
1912#[cfg_attr(test, assert_instr(vpmaskmovq))]
1913#[stable(feature = "simd_x86", since = "1.27.0")]
1914pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
1915    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
1916}
1917
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1919/// using `mask` (elements are not stored when the highest bit is not set
1920/// in the corresponding element).
1921///
1922/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
1923#[inline]
1924#[target_feature(enable = "avx2")]
1925#[cfg_attr(test, assert_instr(vpmaskmovq))]
1926#[stable(feature = "simd_x86", since = "1.27.0")]
1927pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
1928    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
1929}
1930
1931/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1932/// maximum values.
1933///
1934/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
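///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: the comparison is signed, element by element.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(2);
/// let b = _mm256_set1_epi16(-3);
/// // Signed maximum: 2 wins in every lane.
/// let r = _mm256_max_epi16(a, b);
/// let mut out = [0i16; 16];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [2; 16]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```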
1935#[inline]
1936#[target_feature(enable = "avx2")]
1937#[cfg_attr(test, assert_instr(vpmaxsw))]
1938#[stable(feature = "simd_x86", since = "1.27.0")]
1939pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
1940    unsafe {
1941        let a = a.as_i16x16();
1942        let b = b.as_i16x16();
1943        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1944    }
1945}
1946
1947/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1948/// maximum values.
1949///
1950/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
1951#[inline]
1952#[target_feature(enable = "avx2")]
1953#[cfg_attr(test, assert_instr(vpmaxsd))]
1954#[stable(feature = "simd_x86", since = "1.27.0")]
1955pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
1956    unsafe {
1957        let a = a.as_i32x8();
1958        let b = b.as_i32x8();
1959        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1960    }
1961}
1962
1963/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1964/// maximum values.
1965///
1966/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
1967#[inline]
1968#[target_feature(enable = "avx2")]
1969#[cfg_attr(test, assert_instr(vpmaxsb))]
1970#[stable(feature = "simd_x86", since = "1.27.0")]
1971pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
1972    unsafe {
1973        let a = a.as_i8x32();
1974        let b = b.as_i8x32();
1975        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1976    }
1977}
1978
1979/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
1980/// the packed maximum values.
1981///
1982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
1983#[inline]
1984#[target_feature(enable = "avx2")]
1985#[cfg_attr(test, assert_instr(vpmaxuw))]
1986#[stable(feature = "simd_x86", since = "1.27.0")]
1987pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
1988    unsafe {
1989        let a = a.as_u16x16();
1990        let b = b.as_u16x16();
1991        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1992    }
1993}
1994
1995/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
1996/// the packed maximum values.
1997///
1998/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
1999#[inline]
2000#[target_feature(enable = "avx2")]
2001#[cfg_attr(test, assert_instr(vpmaxud))]
2002#[stable(feature = "simd_x86", since = "1.27.0")]
2003pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2004    unsafe {
2005        let a = a.as_u32x8();
2006        let b = b.as_u32x8();
2007        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
2008    }
2009}
2010
2011/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2012/// the packed maximum values.
2013///
2014/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
2015#[inline]
2016#[target_feature(enable = "avx2")]
2017#[cfg_attr(test, assert_instr(vpmaxub))]
2018#[stable(feature = "simd_x86", since = "1.27.0")]
2019pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2020    unsafe {
2021        let a = a.as_u8x32();
2022        let b = b.as_u8x32();
2023        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
2024    }
2025}
2026
2027/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2028/// minimum values.
2029///
2030/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
2031#[inline]
2032#[target_feature(enable = "avx2")]
2033#[cfg_attr(test, assert_instr(vpminsw))]
2034#[stable(feature = "simd_x86", since = "1.27.0")]
2035pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2036    unsafe {
2037        let a = a.as_i16x16();
2038        let b = b.as_i16x16();
2039        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
2040    }
2041}
2042
2043/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2044/// minimum values.
2045///
2046/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
2047#[inline]
2048#[target_feature(enable = "avx2")]
2049#[cfg_attr(test, assert_instr(vpminsd))]
2050#[stable(feature = "simd_x86", since = "1.27.0")]
2051pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2052    unsafe {
2053        let a = a.as_i32x8();
2054        let b = b.as_i32x8();
2055        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2056    }
2057}
2058
2059/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2060/// minimum values.
2061///
2062/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
2063#[inline]
2064#[target_feature(enable = "avx2")]
2065#[cfg_attr(test, assert_instr(vpminsb))]
2066#[stable(feature = "simd_x86", since = "1.27.0")]
2067pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2068    unsafe {
2069        let a = a.as_i8x32();
2070        let b = b.as_i8x32();
2071        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2072    }
2073}
2074
2075/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2076/// the packed minimum values.
2077///
2078/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2079#[inline]
2080#[target_feature(enable = "avx2")]
2081#[cfg_attr(test, assert_instr(vpminuw))]
2082#[stable(feature = "simd_x86", since = "1.27.0")]
2083pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2084    unsafe {
2085        let a = a.as_u16x16();
2086        let b = b.as_u16x16();
2087        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
2088    }
2089}
2090
2091/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2092/// the packed minimum values.
2093///
2094/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2095#[inline]
2096#[target_feature(enable = "avx2")]
2097#[cfg_attr(test, assert_instr(vpminud))]
2098#[stable(feature = "simd_x86", since = "1.27.0")]
2099pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2100    unsafe {
2101        let a = a.as_u32x8();
2102        let b = b.as_u32x8();
2103        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2104    }
2105}
2106
2107/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2108/// the packed minimum values.
2109///
2110/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2111#[inline]
2112#[target_feature(enable = "avx2")]
2113#[cfg_attr(test, assert_instr(vpminub))]
2114#[stable(feature = "simd_x86", since = "1.27.0")]
2115pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2116    unsafe {
2117        let a = a.as_u8x32();
2118        let b = b.as_u8x32();
2119        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2120    }
2121}
2122
/// Creates a mask from the most significant bit of each 8-bit element in `a`
/// and returns the result.
2125///
2126/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
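///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: bit `i` of the result is the sign bit of byte `i` of `a`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,
/// );
/// // Only bytes 0 and 31 have their sign bit set.
/// let m = _mm256_movemask_epi8(a);
/// assert_eq!(m as u32, 0x8000_0001);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```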
2127#[inline]
2128#[target_feature(enable = "avx2")]
2129#[cfg_attr(test, assert_instr(vpmovmskb))]
2130#[stable(feature = "simd_x86", since = "1.27.0")]
2131pub fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2132    unsafe {
2133        let z = i8x32::ZERO;
2134        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2135        simd_bitmask::<_, u32>(m) as i32
2136    }
2137}
2138
2139/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2140/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2141/// results in dst. Eight SADs are performed for each 128-bit lane using one
2142/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at the offset specified in `imm8`. Eight
2144/// quadruplets are formed from sequential 8-bit integers selected from `a`
2145/// starting at the offset specified in `imm8`.
2146///
2147/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
2148#[inline]
2149#[target_feature(enable = "avx2")]
2150#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
2151#[rustc_legacy_const_generics(2)]
2152#[stable(feature = "simd_x86", since = "1.27.0")]
2153pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2154    static_assert_uimm_bits!(IMM8, 8);
2155    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8)) }
2156}
2157
2158/// Multiplies the low 32-bit integers from each packed 64-bit element in
2159/// `a` and `b`
2160///
2161/// Returns the 64-bit results.
2162///
2163/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
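///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: only the low 32 bits of each 64-bit element take part in the
/// multiply, and the products are sign-extended to 64 bits.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // The 99s sit in the high half of each 64-bit element and are ignored.
/// let a = _mm256_setr_epi32(1, 99, 2, 99, 3, 99, 4, 99);
/// let b = _mm256_setr_epi32(-5, 99, -6, 99, -7, 99, -8, 99);
/// let r = _mm256_mul_epi32(a, b);
/// let mut out = [0i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [-5, -12, -21, -32]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```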
2164#[inline]
2165#[target_feature(enable = "avx2")]
2166#[cfg_attr(test, assert_instr(vpmuldq))]
2167#[stable(feature = "simd_x86", since = "1.27.0")]
2168pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2169    unsafe {
2170        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
2171        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
2172        transmute(simd_mul(a, b))
2173    }
2174}
2175
2176/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2177/// element in `a` and `b`
2178///
2179/// Returns the unsigned 64-bit results.
2180///
2181/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
2182#[inline]
2183#[target_feature(enable = "avx2")]
2184#[cfg_attr(test, assert_instr(vpmuludq))]
2185#[stable(feature = "simd_x86", since = "1.27.0")]
2186pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2187    unsafe {
2188        let a = a.as_u64x4();
2189        let b = b.as_u64x4();
2190        let mask = u64x4::splat(u32::MAX.into());
2191        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
2192    }
2193}
2194
2195/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2196/// intermediate 32-bit integers and returning the high 16 bits of the
2197/// intermediate integers.
2198///
2199/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
2200#[inline]
2201#[target_feature(enable = "avx2")]
2202#[cfg_attr(test, assert_instr(vpmulhw))]
2203#[stable(feature = "simd_x86", since = "1.27.0")]
2204pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2205    unsafe {
2206        let a = simd_cast::<_, i32x16>(a.as_i16x16());
2207        let b = simd_cast::<_, i32x16>(b.as_i16x16());
2208        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
2209        transmute(simd_cast::<i32x16, i16x16>(r))
2210    }
2211}
2212
2213/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2214/// intermediate 32-bit integers and returning the high 16 bits of the
2215/// intermediate integers.
2216///
2217/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
2218#[inline]
2219#[target_feature(enable = "avx2")]
2220#[cfg_attr(test, assert_instr(vpmulhuw))]
2221#[stable(feature = "simd_x86", since = "1.27.0")]
2222pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2223    unsafe {
2224        let a = simd_cast::<_, u32x16>(a.as_u16x16());
2225        let b = simd_cast::<_, u32x16>(b.as_u16x16());
2226        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
2227        transmute(simd_cast::<u32x16, u16x16>(r))
2228    }
2229}
2230
2231/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2232/// intermediate 32-bit integers, and returns the low 16 bits of the
2233/// intermediate integers
2234///
2235/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
2236#[inline]
2237#[target_feature(enable = "avx2")]
2238#[cfg_attr(test, assert_instr(vpmullw))]
2239#[stable(feature = "simd_x86", since = "1.27.0")]
2240pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2241    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
2242}
2243
2244/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2245/// intermediate 64-bit integers, and returns the low 32 bits of the
2246/// intermediate integers
2247///
2248/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
2249#[inline]
2250#[target_feature(enable = "avx2")]
2251#[cfg_attr(test, assert_instr(vpmulld))]
2252#[stable(feature = "simd_x86", since = "1.27.0")]
2253pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2254    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
2255}
2256
/// Multiplies packed 16-bit integers in `a` and `b`, producing intermediate
/// signed 32-bit integers. Each intermediate integer is truncated to its 18
/// most significant bits, rounded by adding 1, and bits `[16:1]` of that
/// result are returned.
2261///
2262/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
2263#[inline]
2264#[target_feature(enable = "avx2")]
2265#[cfg_attr(test, assert_instr(vpmulhrsw))]
2266#[stable(feature = "simd_x86", since = "1.27.0")]
2267pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2268    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
2269}
2270
2271/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2272/// and `b`
2273///
2274/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
2275#[inline]
2276#[target_feature(enable = "avx2")]
2277#[cfg_attr(test, assert_instr(vorps))]
2278#[stable(feature = "simd_x86", since = "1.27.0")]
2279pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2280    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
2281}
2282
2283/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2284/// using signed saturation
2285///
2286/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
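///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: within each 128-bit lane, eight saturated bytes come from `a`,
/// then eight from `b`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(300);
/// let b = _mm256_set1_epi16(-2);
/// let r = _mm256_packs_epi16(a, b);
/// let mut out = [0i8; 32];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out[0], 127); // 300 saturates to i8::MAX
/// assert_eq!(out[8], -2); // bytes 8..16 of the low lane come from `b`
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```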
2287#[inline]
2288#[target_feature(enable = "avx2")]
2289#[cfg_attr(test, assert_instr(vpacksswb))]
2290#[stable(feature = "simd_x86", since = "1.27.0")]
2291pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2292    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
2293}
2294
2295/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2296/// using signed saturation
2297///
2298/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
2299#[inline]
2300#[target_feature(enable = "avx2")]
2301#[cfg_attr(test, assert_instr(vpackssdw))]
2302#[stable(feature = "simd_x86", since = "1.27.0")]
2303pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2304    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
2305}
2306
2307/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2308/// using unsigned saturation
2309///
2310/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
2311#[inline]
2312#[target_feature(enable = "avx2")]
2313#[cfg_attr(test, assert_instr(vpackuswb))]
2314#[stable(feature = "simd_x86", since = "1.27.0")]
2315pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2316    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
2317}
2318
2319/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2320/// using unsigned saturation
2321///
2322/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
2323#[inline]
2324#[target_feature(enable = "avx2")]
2325#[cfg_attr(test, assert_instr(vpackusdw))]
2326#[stable(feature = "simd_x86", since = "1.27.0")]
2327pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2328    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
2329}
2330
2331/// Permutes packed 32-bit integers from `a` according to the content of `b`.
2332///
/// The lowest 3 bits of each integer of `b` are used as indices into the 8
/// integers of `a`.
2335///
2336/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
2337#[inline]
2338#[target_feature(enable = "avx2")]
2339#[cfg_attr(test, assert_instr(vpermps))]
2340#[stable(feature = "simd_x86", since = "1.27.0")]
2341pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2342    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
2343}
2344
2345/// Permutes 64-bit integers from `a` using control mask `imm8`.
2346///
2347/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
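///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: each pair of bits in `IMM8` selects a source element, lowest
/// bits first.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(10, 20, 30, 40);
/// // 0b00_01_10_11 reverses the four elements.
/// let r = _mm256_permute4x64_epi64::<0b00_01_10_11>(a);
/// let mut out = [0i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [40, 30, 20, 10]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```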
2348#[inline]
2349#[target_feature(enable = "avx2")]
2350#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
2351#[rustc_legacy_const_generics(1)]
2352#[stable(feature = "simd_x86", since = "1.27.0")]
2353pub fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2354    static_assert_uimm_bits!(IMM8, 8);
2355    unsafe {
2356        let zero = i64x4::ZERO;
2357        let r: i64x4 = simd_shuffle!(
2358            a.as_i64x4(),
2359            zero,
2360            [
2361                IMM8 as u32 & 0b11,
2362                (IMM8 as u32 >> 2) & 0b11,
2363                (IMM8 as u32 >> 4) & 0b11,
2364                (IMM8 as u32 >> 6) & 0b11,
2365            ],
2366        );
2367        transmute(r)
2368    }
2369}
2370
2371/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
2372///
2373/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
2374#[inline]
2375#[target_feature(enable = "avx2")]
2376#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
2377#[rustc_legacy_const_generics(2)]
2378#[stable(feature = "simd_x86", since = "1.27.0")]
2379pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2380    static_assert_uimm_bits!(IMM8, 8);
2381    unsafe { transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) }
2382}
2383
2384/// Shuffles 64-bit floating-point elements in `a` across lanes using the
2385/// control in `imm8`.
2386///
2387/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
2388#[inline]
2389#[target_feature(enable = "avx2")]
2390#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
2391#[rustc_legacy_const_generics(1)]
2392#[stable(feature = "simd_x86", since = "1.27.0")]
2393pub fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
2394    static_assert_uimm_bits!(IMM8, 8);
2395    unsafe {
2396        simd_shuffle!(
2397            a,
2398            _mm256_undefined_pd(),
2399            [
2400                IMM8 as u32 & 0b11,
2401                (IMM8 as u32 >> 2) & 0b11,
2402                (IMM8 as u32 >> 4) & 0b11,
2403                (IMM8 as u32 >> 6) & 0b11,
2404            ],
2405        )
2406    }
2407}
2408
2409/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2410/// the corresponding 32-bit integer index in `idx`.
2411///
2412/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
2413#[inline]
2414#[target_feature(enable = "avx2")]
2415#[cfg_attr(test, assert_instr(vpermps))]
2416#[stable(feature = "simd_x86", since = "1.27.0")]
2417pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2418    unsafe { permps(a, idx.as_i32x8()) }
2419}
2420
2421/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2422/// and `b`, then horizontally sum each consecutive 8 differences to
2423/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
2424/// integers in the low 16 bits of the 64-bit return value
2425///
2426/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
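///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: every group of eight byte differences is summed into one 64-bit
/// lane.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(5);
/// let b = _mm256_set1_epi8(2);
/// // Each group of eight byte differences sums to 8 * |5 - 2| = 24.
/// let r = _mm256_sad_epu8(a, b);
/// let mut out = [0i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [24; 4]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```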
2427#[inline]
2428#[target_feature(enable = "avx2")]
2429#[cfg_attr(test, assert_instr(vpsadbw))]
2430#[stable(feature = "simd_x86", since = "1.27.0")]
2431pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2432    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
2433}
2434
2435/// Shuffles bytes from `a` according to the content of `b`.
2436///
/// For each of the 128-bit low and high halves of the vectors, the lowest
/// 4 bits of each byte of `b` are used as indices into the respective
/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
2440///
/// In addition, if the most significant bit of a byte of `b` is set, the
/// respective destination byte is set to 0.
2443///
2444/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2445/// equivalent to:
2446///
2447/// ```
2448/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2449///     let mut r = [0; 32];
2450///     for i in 0..16 {
///         // a byte of a is copied only when the most significant bit
///         // of the corresponding byte of b is clear; otherwise the
///         // destination byte stays 0.
2453///         if b[i] & 0x80 == 0u8 {
2454///             r[i] = a[(b[i] % 16) as usize];
2455///         }
2456///         if b[i + 16] & 0x80 == 0u8 {
2457///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2458///         }
2459///     }
2460///     r
2461/// }
2462/// ```
2463///
2464/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
2465#[inline]
2466#[target_feature(enable = "avx2")]
2467#[cfg_attr(test, assert_instr(vpshufb))]
2468#[stable(feature = "simd_x86", since = "1.27.0")]
2469pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2470    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
2471}
2472
2473/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2474/// `imm8`.
2475///
2476/// ```rust
2477/// #[cfg(target_arch = "x86")]
2478/// use std::arch::x86::*;
2479/// #[cfg(target_arch = "x86_64")]
2480/// use std::arch::x86_64::*;
2481///
2482/// # fn main() {
2483/// #     if is_x86_feature_detected!("avx2") {
2484/// #         #[target_feature(enable = "avx2")]
2485/// #         unsafe fn worker() {
2486/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2487///
2488/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2489/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2490///
2491/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2492/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2493///
2494/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2495/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2496/// #         }
2497/// #         unsafe { worker(); }
2498/// #     }
2499/// # }
2500/// ```
2501///
2502/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
2503#[inline]
2504#[target_feature(enable = "avx2")]
2505#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
2506#[rustc_legacy_const_generics(1)]
2507#[stable(feature = "simd_x86", since = "1.27.0")]
2508pub fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
2509    static_assert_uimm_bits!(MASK, 8);
2510    unsafe {
2511        let r: i32x8 = simd_shuffle!(
2512            a.as_i32x8(),
2513            a.as_i32x8(),
2514            [
2515                MASK as u32 & 0b11,
2516                (MASK as u32 >> 2) & 0b11,
2517                (MASK as u32 >> 4) & 0b11,
2518                (MASK as u32 >> 6) & 0b11,
2519                (MASK as u32 & 0b11) + 4,
2520                ((MASK as u32 >> 2) & 0b11) + 4,
2521                ((MASK as u32 >> 4) & 0b11) + 4,
2522                ((MASK as u32 >> 6) & 0b11) + 4,
2523            ],
2524        );
2525        transmute(r)
2526    }
2527}
2528
2529/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2530/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2531/// to the output.
2532///
2533/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
2534#[inline]
2535#[target_feature(enable = "avx2")]
2536#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
2537#[rustc_legacy_const_generics(1)]
2538#[stable(feature = "simd_x86", since = "1.27.0")]
2539pub fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2540    static_assert_uimm_bits!(IMM8, 8);
2541    unsafe {
2542        let a = a.as_i16x16();
2543        let r: i16x16 = simd_shuffle!(
2544            a,
2545            a,
2546            [
2547                0,
2548                1,
2549                2,
2550                3,
2551                4 + (IMM8 as u32 & 0b11),
2552                4 + ((IMM8 as u32 >> 2) & 0b11),
2553                4 + ((IMM8 as u32 >> 4) & 0b11),
2554                4 + ((IMM8 as u32 >> 6) & 0b11),
2555                8,
2556                9,
2557                10,
2558                11,
2559                12 + (IMM8 as u32 & 0b11),
2560                12 + ((IMM8 as u32 >> 2) & 0b11),
2561                12 + ((IMM8 as u32 >> 4) & 0b11),
2562                12 + ((IMM8 as u32 >> 6) & 0b11),
2563            ],
2564        );
2565        transmute(r)
2566    }
2567}
2568
2569/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2570/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2571/// to the output.
2572///
2573/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
2574#[inline]
2575#[target_feature(enable = "avx2")]
2576#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
2577#[rustc_legacy_const_generics(1)]
2578#[stable(feature = "simd_x86", since = "1.27.0")]
2579pub fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2580    static_assert_uimm_bits!(IMM8, 8);
2581    unsafe {
2582        let a = a.as_i16x16();
2583        let r: i16x16 = simd_shuffle!(
2584            a,
2585            a,
2586            [
2587                0 + (IMM8 as u32 & 0b11),
2588                0 + ((IMM8 as u32 >> 2) & 0b11),
2589                0 + ((IMM8 as u32 >> 4) & 0b11),
2590                0 + ((IMM8 as u32 >> 6) & 0b11),
2591                4,
2592                5,
2593                6,
2594                7,
2595                8 + (IMM8 as u32 & 0b11),
2596                8 + ((IMM8 as u32 >> 2) & 0b11),
2597                8 + ((IMM8 as u32 >> 4) & 0b11),
2598                8 + ((IMM8 as u32 >> 6) & 0b11),
2599                12,
2600                13,
2601                14,
2602                15,
2603            ],
2604        );
2605        transmute(r)
2606    }
2607}
2608
2609/// Negates packed 16-bit integers in `a` when the corresponding signed
2610/// 16-bit integer in `b` is negative, and returns the results.
2611/// Results are zeroed out when the corresponding element in `b` is zero.
2612///
2613/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
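///
/// A minimal usage sketch, assuming `avx2` support has been detected at
/// runtime: each lane of `a` is negated, zeroed, or passed through depending
/// on the sign of the corresponding lane of `b`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(7);
/// let b = _mm256_setr_epi16(
///     -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1,
/// );
/// let r = _mm256_sign_epi16(a, b);
/// let mut out = [0i16; 16];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// // Negated where b is negative, zeroed where b is zero, copied otherwise.
/// assert_eq!(&out[..3], &[-7, 0, 7]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```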
2614#[inline]
2615#[target_feature(enable = "avx2")]
2616#[cfg_attr(test, assert_instr(vpsignw))]
2617#[stable(feature = "simd_x86", since = "1.27.0")]
2618pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2619    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
2620}
2621
2622/// Negates packed 32-bit integers in `a` when the corresponding signed
2623/// 32-bit integer in `b` is negative, and returns the results.
2624/// Results are zeroed out when the corresponding element in `b` is zero.
2625///
2626/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
2627#[inline]
2628#[target_feature(enable = "avx2")]
2629#[cfg_attr(test, assert_instr(vpsignd))]
2630#[stable(feature = "simd_x86", since = "1.27.0")]
2631pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2632    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
2633}
2634
2635/// Negates packed 8-bit integers in `a` when the corresponding signed
2636/// 8-bit integer in `b` is negative, and returns the results.
2637/// Results are zeroed out when the corresponding element in `b` is zero.
2638///
2639/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
2640#[inline]
2641#[target_feature(enable = "avx2")]
2642#[cfg_attr(test, assert_instr(vpsignb))]
2643#[stable(feature = "simd_x86", since = "1.27.0")]
2644pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2645    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
2646}
2647
2648/// Shifts packed 16-bit integers in `a` left by `count` while
2649/// shifting in zeros, and returns the result
2650///
2651/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
2652#[inline]
2653#[target_feature(enable = "avx2")]
2654#[cfg_attr(test, assert_instr(vpsllw))]
2655#[stable(feature = "simd_x86", since = "1.27.0")]
2656pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2657    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
2658}
2659
2660/// Shifts packed 32-bit integers in `a` left by `count` while
2661/// shifting in zeros, and returns the result
2662///
2663/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
2664#[inline]
2665#[target_feature(enable = "avx2")]
2666#[cfg_attr(test, assert_instr(vpslld))]
2667#[stable(feature = "simd_x86", since = "1.27.0")]
2668pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2669    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
2670}
2671
2672/// Shifts packed 64-bit integers in `a` left by `count` while
2673/// shifting in zeros, and returns the result
2674///
2675/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
2676#[inline]
2677#[target_feature(enable = "avx2")]
2678#[cfg_attr(test, assert_instr(vpsllq))]
2679#[stable(feature = "simd_x86", since = "1.27.0")]
2680pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2681    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
2682}
2683
2684/// Shifts packed 16-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2686///
2687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
2688#[inline]
2689#[target_feature(enable = "avx2")]
2690#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
2691#[rustc_legacy_const_generics(1)]
2692#[stable(feature = "simd_x86", since = "1.27.0")]
2693pub fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2694    static_assert_uimm_bits!(IMM8, 8);
2695    unsafe {
2696        if IMM8 >= 16 {
2697            _mm256_setzero_si256()
2698        } else {
2699            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
2700        }
2701    }
2702}
2703
2704/// Shifts packed 32-bit integers in `a` left by `IMM8` while
2705/// shifting in zeros, and returns the results.
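///
/// A minimal sketch, assuming AVX2 support is detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let r = _mm256_slli_epi32(a, 5);
/// // 1 << 5 == 32 in every lane; immediates of 32 or more produce all zeros
/// let expected = _mm256_set1_epi32(32);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```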
2706///
2707/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2708#[inline]
2709#[target_feature(enable = "avx2")]
2710#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2711#[rustc_legacy_const_generics(1)]
2712#[stable(feature = "simd_x86", since = "1.27.0")]
2713pub fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2714    unsafe {
2715        static_assert_uimm_bits!(IMM8, 8);
2716        if IMM8 >= 32 {
2717            _mm256_setzero_si256()
2718        } else {
2719            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2720        }
2721    }
2722}
2723
2724/// Shifts packed 64-bit integers in `a` left by `IMM8` while
2725/// shifting in zeros, and returns the results.
2726///
2727/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2728#[inline]
2729#[target_feature(enable = "avx2")]
2730#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2731#[rustc_legacy_const_generics(1)]
2732#[stable(feature = "simd_x86", since = "1.27.0")]
2733pub fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2734    unsafe {
2735        static_assert_uimm_bits!(IMM8, 8);
2736        if IMM8 >= 64 {
2737            _mm256_setzero_si256()
2738        } else {
2739            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2740        }
2741    }
2742}
2743
2744/// Shifts 128-bit lanes in `a` left by `IMM8` bytes while shifting in zeros.
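///
/// A minimal sketch, assuming AVX2 support is detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // each 128-bit lane is shifted independently; bytes do not cross lanes
/// let a = _mm256_set1_epi8(1);
/// let r = _mm256_slli_si256(a, 1);
/// let expected = _mm256_setr_epi8(
///     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
///     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```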
2745///
2746/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
2747#[inline]
2748#[target_feature(enable = "avx2")]
2749#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2750#[rustc_legacy_const_generics(1)]
2751#[stable(feature = "simd_x86", since = "1.27.0")]
2752pub fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2753    static_assert_uimm_bits!(IMM8, 8);
2754    _mm256_bslli_epi128::<IMM8>(a)
2755}
2756
2757/// Shifts 128-bit lanes in `a` left by `IMM8` bytes while shifting in zeros.
2758///
2759/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
2760#[inline]
2761#[target_feature(enable = "avx2")]
2762#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2763#[rustc_legacy_const_generics(1)]
2764#[stable(feature = "simd_x86", since = "1.27.0")]
2765pub fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2766    static_assert_uimm_bits!(IMM8, 8);
2767    const fn mask(shift: i32, i: u32) -> u32 {
2768        let shift = shift as u32 & 0xff;
2769        if shift > 15 || i % 16 < shift {
2770            0
2771        } else {
2772            32 + (i - shift)
2773        }
2774    }
2775    unsafe {
2776        let a = a.as_i8x32();
2777        let r: i8x32 = simd_shuffle!(
2778            i8x32::ZERO,
2779            a,
2780            [
2781                mask(IMM8, 0),
2782                mask(IMM8, 1),
2783                mask(IMM8, 2),
2784                mask(IMM8, 3),
2785                mask(IMM8, 4),
2786                mask(IMM8, 5),
2787                mask(IMM8, 6),
2788                mask(IMM8, 7),
2789                mask(IMM8, 8),
2790                mask(IMM8, 9),
2791                mask(IMM8, 10),
2792                mask(IMM8, 11),
2793                mask(IMM8, 12),
2794                mask(IMM8, 13),
2795                mask(IMM8, 14),
2796                mask(IMM8, 15),
2797                mask(IMM8, 16),
2798                mask(IMM8, 17),
2799                mask(IMM8, 18),
2800                mask(IMM8, 19),
2801                mask(IMM8, 20),
2802                mask(IMM8, 21),
2803                mask(IMM8, 22),
2804                mask(IMM8, 23),
2805                mask(IMM8, 24),
2806                mask(IMM8, 25),
2807                mask(IMM8, 26),
2808                mask(IMM8, 27),
2809                mask(IMM8, 28),
2810                mask(IMM8, 29),
2811                mask(IMM8, 30),
2812                mask(IMM8, 31),
2813            ],
2814        );
2815        transmute(r)
2816    }
2817}
2818
2819/// Shifts packed 32-bit integers in `a` left by the amount
2820/// specified by the corresponding element in `count` while
2821/// shifting in zeros, and returns the result.
2822///
2823/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
2824#[inline]
2825#[target_feature(enable = "avx2")]
2826#[cfg_attr(test, assert_instr(vpsllvd))]
2827#[stable(feature = "simd_x86", since = "1.27.0")]
2828pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
2829    unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) }
2830}
2831
2832/// Shifts packed 32-bit integers in `a` left by the amount
2833/// specified by the corresponding element in `count` while
2834/// shifting in zeros, and returns the result.
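///
/// A minimal sketch, assuming AVX2 support is detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// // unlike `_mm256_sll_epi32`, every lane gets its own shift count
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let r = _mm256_sllv_epi32(a, count);
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```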
2835///
2836/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2837#[inline]
2838#[target_feature(enable = "avx2")]
2839#[cfg_attr(test, assert_instr(vpsllvd))]
2840#[stable(feature = "simd_x86", since = "1.27.0")]
2841pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2842    unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) }
2843}
2844
2845/// Shifts packed 64-bit integers in `a` left by the amount
2846/// specified by the corresponding element in `count` while
2847/// shifting in zeros, and returns the result.
2848///
2849/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2850#[inline]
2851#[target_feature(enable = "avx2")]
2852#[cfg_attr(test, assert_instr(vpsllvq))]
2853#[stable(feature = "simd_x86", since = "1.27.0")]
2854pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
2855    unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) }
2856}
2857
2858/// Shifts packed 64-bit integers in `a` left by the amount
2859/// specified by the corresponding element in `count` while
2860/// shifting in zeros, and returns the result.
2861///
2862/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
2863#[inline]
2864#[target_feature(enable = "avx2")]
2865#[cfg_attr(test, assert_instr(vpsllvq))]
2866#[stable(feature = "simd_x86", since = "1.27.0")]
2867pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
2868    unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) }
2869}
2870
2871/// Shifts packed 16-bit integers in `a` right by `count` while
2872/// shifting in sign bits.
2873///
2874/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
2875#[inline]
2876#[target_feature(enable = "avx2")]
2877#[cfg_attr(test, assert_instr(vpsraw))]
2878#[stable(feature = "simd_x86", since = "1.27.0")]
2879pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
2880    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
2881}
2882
2883/// Shifts packed 32-bit integers in `a` right by `count` while
2884/// shifting in sign bits.
2885///
2886/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
2887#[inline]
2888#[target_feature(enable = "avx2")]
2889#[cfg_attr(test, assert_instr(vpsrad))]
2890#[stable(feature = "simd_x86", since = "1.27.0")]
2891pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
2892    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
2893}
2894
2895/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2896/// shifting in sign bits.
2897///
2898/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
2899#[inline]
2900#[target_feature(enable = "avx2")]
2901#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
2902#[rustc_legacy_const_generics(1)]
2903#[stable(feature = "simd_x86", since = "1.27.0")]
2904pub fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2905    static_assert_uimm_bits!(IMM8, 8);
2906    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
2907}
2908
2909/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2910/// shifting in sign bits.
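///
/// A minimal sketch, assuming AVX2 support is detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // the sign bit is replicated, so negative lanes stay negative
/// let a = _mm256_setr_epi32(-32, 32, -1, 1, -8, 8, -64, 64);
/// let r = _mm256_srai_epi32(a, 3);
/// let expected = _mm256_setr_epi32(-4, 4, -1, 0, -1, 1, -8, 8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```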
2911///
2912/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
2913#[inline]
2914#[target_feature(enable = "avx2")]
2915#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
2916#[rustc_legacy_const_generics(1)]
2917#[stable(feature = "simd_x86", since = "1.27.0")]
2918pub fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2919    static_assert_uimm_bits!(IMM8, 8);
2920    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
2921}
2922
2923/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2924/// corresponding element in `count` while shifting in sign bits.
2925///
2926/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
2927#[inline]
2928#[target_feature(enable = "avx2")]
2929#[cfg_attr(test, assert_instr(vpsravd))]
2930#[stable(feature = "simd_x86", since = "1.27.0")]
2931pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
2932    unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) }
2933}
2934
2935/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2936/// corresponding element in `count` while shifting in sign bits.
2937///
2938/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
2939#[inline]
2940#[target_feature(enable = "avx2")]
2941#[cfg_attr(test, assert_instr(vpsravd))]
2942#[stable(feature = "simd_x86", since = "1.27.0")]
2943pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
2944    unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) }
2945}
2946
2947/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
2948///
2949/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
2950#[inline]
2951#[target_feature(enable = "avx2")]
2952#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2953#[rustc_legacy_const_generics(1)]
2954#[stable(feature = "simd_x86", since = "1.27.0")]
2955pub fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2956    static_assert_uimm_bits!(IMM8, 8);
2957    _mm256_bsrli_epi128::<IMM8>(a)
2958}
2959
2960/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
2961///
2962/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
2963#[inline]
2964#[target_feature(enable = "avx2")]
2965#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2966#[rustc_legacy_const_generics(1)]
2967#[stable(feature = "simd_x86", since = "1.27.0")]
2968pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2969    static_assert_uimm_bits!(IMM8, 8);
2970    unsafe {
2971        let a = a.as_i8x32();
2972        let zero = i8x32::ZERO;
2973        let r: i8x32 = match IMM8 {
2974            0 => simd_shuffle!(
2975                a,
2976                zero,
2977                [
2978                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
2979                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2980                ],
2981            ),
2982            1 => simd_shuffle!(
2983                a,
2984                zero,
2985                [
2986                    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22,
2987                    23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
2988                ],
2989            ),
2990            2 => simd_shuffle!(
2991                a,
2992                zero,
2993                [
2994                    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23,
2995                    24, 25, 26, 27, 28, 29, 30, 31, 32, 32,
2996                ],
2997            ),
2998            3 => simd_shuffle!(
2999                a,
3000                zero,
3001                [
3002                    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23,
3003                    24, 25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
3004                ],
3005            ),
3006            4 => simd_shuffle!(
3007                a,
3008                zero,
3009                [
3010                    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24,
3011                    25, 26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
3012                ],
3013            ),
3014            5 => simd_shuffle!(
3015                a,
3016                zero,
3017                [
3018                    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25,
3019                    26, 27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
3020                ],
3021            ),
3022            6 => simd_shuffle!(
3023                a,
3024                zero,
3025                [
3026                    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26,
3027                    27, 28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
3028                ],
3029            ),
3030            7 => simd_shuffle!(
3031                a,
3032                zero,
3033                [
3034                    7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26,
3035                    27, 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
3036                ],
3037            ),
3038            8 => simd_shuffle!(
3039                a,
3040                zero,
3041                [
3042                    8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27,
3043                    28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
3044                ],
3045            ),
3046            9 => simd_shuffle!(
3047                a,
3048                zero,
3049                [
3050                    9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28,
3051                    29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3052                ],
3053            ),
3054            10 => simd_shuffle!(
3055                a,
3056                zero,
3057                [
3058                    10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29,
3059                    30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3060                ],
3061            ),
3062            11 => simd_shuffle!(
3063                a,
3064                zero,
3065                [
3066                    11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30,
3067                    31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3068                ],
3069            ),
3070            12 => simd_shuffle!(
3071                a,
3072                zero,
3073                [
3074                    12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31,
3075                    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3076                ],
3077            ),
3078            13 => simd_shuffle!(
3079                a,
3080                zero,
3081                [
3082                    13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32,
3083                    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3084                ],
3085            ),
3086            14 => simd_shuffle!(
3087                a,
3088                zero,
3089                [
3090                    14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32,
3091                    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3092                ],
3093            ),
3094            15 => simd_shuffle!(
3095                a,
3096                zero,
3097                [
3098                    15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
3099                    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
3100                ],
3101            ),
3102            _ => zero,
3103        };
3104        transmute(r)
3105    }
3106}
3107
3108/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3109/// zeros.
3110///
3111/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
3112#[inline]
3113#[target_feature(enable = "avx2")]
3114#[cfg_attr(test, assert_instr(vpsrlw))]
3115#[stable(feature = "simd_x86", since = "1.27.0")]
3116pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3117    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
3118}
3119
3120/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3121/// zeros.
3122///
3123/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
3124#[inline]
3125#[target_feature(enable = "avx2")]
3126#[cfg_attr(test, assert_instr(vpsrld))]
3127#[stable(feature = "simd_x86", since = "1.27.0")]
3128pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3129    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
3130}
3131
3132/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3133/// zeros.
3134///
3135/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3136#[inline]
3137#[target_feature(enable = "avx2")]
3138#[cfg_attr(test, assert_instr(vpsrlq))]
3139#[stable(feature = "simd_x86", since = "1.27.0")]
3140pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3141    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
3142}
3143
3144/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
3145/// zeros.
3146///
3147/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3148#[inline]
3149#[target_feature(enable = "avx2")]
3150#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3151#[rustc_legacy_const_generics(1)]
3152#[stable(feature = "simd_x86", since = "1.27.0")]
3153pub fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3154    static_assert_uimm_bits!(IMM8, 8);
3155    unsafe {
3156        if IMM8 >= 16 {
3157            _mm256_setzero_si256()
3158        } else {
3159            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3160        }
3161    }
3162}
3163
3164/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
3165/// zeros.
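///
/// A minimal sketch, assuming AVX2 support is detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // a logical shift fills with zeros, so `-1` becomes a small positive value
/// let a = _mm256_set1_epi32(-1);
/// let r = _mm256_srli_epi32(a, 28);
/// let expected = _mm256_set1_epi32(0xF);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```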
3166///
3167/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3168#[inline]
3169#[target_feature(enable = "avx2")]
3170#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3171#[rustc_legacy_const_generics(1)]
3172#[stable(feature = "simd_x86", since = "1.27.0")]
3173pub fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3174    static_assert_uimm_bits!(IMM8, 8);
3175    unsafe {
3176        if IMM8 >= 32 {
3177            _mm256_setzero_si256()
3178        } else {
3179            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3180        }
3181    }
3182}
3183
3184/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3185/// zeros.
3186///
3187/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3188#[inline]
3189#[target_feature(enable = "avx2")]
3190#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3191#[rustc_legacy_const_generics(1)]
3192#[stable(feature = "simd_x86", since = "1.27.0")]
3193pub fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3194    static_assert_uimm_bits!(IMM8, 8);
3195    unsafe {
3196        if IMM8 >= 64 {
3197            _mm256_setzero_si256()
3198        } else {
3199            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3200        }
3201    }
3202}
3203
3204/// Shifts packed 32-bit integers in `a` right by the amount specified by
3205/// the corresponding element in `count` while shifting in zeros.
3206///
3207/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3208#[inline]
3209#[target_feature(enable = "avx2")]
3210#[cfg_attr(test, assert_instr(vpsrlvd))]
3211#[stable(feature = "simd_x86", since = "1.27.0")]
3212pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3213    unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) }
3214}
3215
3216/// Shifts packed 32-bit integers in `a` right by the amount specified by
3217/// the corresponding element in `count` while shifting in zeros.
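///
/// A minimal sketch, assuming AVX2 support is detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(64);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 32);
/// let r = _mm256_srlv_epi32(a, count);
/// // a per-lane count of 32 or more produces zero in that lane
/// let expected = _mm256_setr_epi32(64, 32, 16, 8, 4, 2, 1, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```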
3218///
3219/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3220#[inline]
3221#[target_feature(enable = "avx2")]
3222#[cfg_attr(test, assert_instr(vpsrlvd))]
3223#[stable(feature = "simd_x86", since = "1.27.0")]
3224pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3225    unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) }
3226}
3227
3228/// Shifts packed 64-bit integers in `a` right by the amount specified by
3229/// the corresponding element in `count` while shifting in zeros.
3230///
3231/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3232#[inline]
3233#[target_feature(enable = "avx2")]
3234#[cfg_attr(test, assert_instr(vpsrlvq))]
3235#[stable(feature = "simd_x86", since = "1.27.0")]
3236pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3237    unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) }
3238}
3239
3240/// Shifts packed 64-bit integers in `a` right by the amount specified by
3241/// the corresponding element in `count` while shifting in zeros.
3242///
3243/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3244#[inline]
3245#[target_feature(enable = "avx2")]
3246#[cfg_attr(test, assert_instr(vpsrlvq))]
3247#[stable(feature = "simd_x86", since = "1.27.0")]
3248pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3249    unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) }
3250}
3251
3252/// Loads 256 bits of integer data from memory into the returned vector using a non-temporal
3253/// memory hint. `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception
3254/// may be generated. To minimize caching, the data is flagged as non-temporal (unlikely to be used again soon).
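///
/// A minimal sketch, assuming AVX2 support is detected at runtime; a `__m256i`
/// value already has the required 32-byte alignment:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let src = _mm256_set1_epi32(7);
/// // any `__m256i` is 32-byte aligned, so its address is a valid `mem_addr`
/// let loaded = unsafe { _mm256_stream_load_si256(&src as *const __m256i) };
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(loaded, src)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```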
3255///
3256/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
3257#[inline]
3258#[target_feature(enable = "avx2")]
3259#[cfg_attr(test, assert_instr(vmovntdqa))]
3260#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3261pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
3262    let dst: __m256i;
3263    crate::arch::asm!(
3264        vpl!("vmovntdqa {a}"),
3265        a = out(ymm_reg) dst,
3266        p = in(reg) mem_addr,
3267        options(pure, readonly, nostack, preserves_flags),
3268    );
3269    dst
3270}
3271
3272/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3273///
3274/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3275#[inline]
3276#[target_feature(enable = "avx2")]
3277#[cfg_attr(test, assert_instr(vpsubw))]
3278#[stable(feature = "simd_x86", since = "1.27.0")]
3279pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3280    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3281}
3282
3283/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3284///
3285/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3286#[inline]
3287#[target_feature(enable = "avx2")]
3288#[cfg_attr(test, assert_instr(vpsubd))]
3289#[stable(feature = "simd_x86", since = "1.27.0")]
3290pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3291    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3292}
3293
3294/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3295///
3296/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3297#[inline]
3298#[target_feature(enable = "avx2")]
3299#[cfg_attr(test, assert_instr(vpsubq))]
3300#[stable(feature = "simd_x86", since = "1.27.0")]
3301pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3302    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3303}
3304
3305/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3306///
3307/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3308#[inline]
3309#[target_feature(enable = "avx2")]
3310#[cfg_attr(test, assert_instr(vpsubb))]
3311#[stable(feature = "simd_x86", since = "1.27.0")]
3312pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3313    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3314}
3315
3316/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3317/// `a` using saturation.
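///
/// A minimal sketch, assuming AVX2 support is detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // the subtraction saturates at `i16::MIN` instead of wrapping around
/// let a = _mm256_set1_epi16(i16::MIN);
/// let b = _mm256_set1_epi16(1);
/// let r = _mm256_subs_epi16(a, b);
/// let expected = _mm256_set1_epi16(i16::MIN);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```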
3318///
3319/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3320#[inline]
3321#[target_feature(enable = "avx2")]
3322#[cfg_attr(test, assert_instr(vpsubsw))]
3323#[stable(feature = "simd_x86", since = "1.27.0")]
3324pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3325    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3326}
3327
3328/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3329/// `a` using saturation.
3330///
3331/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3332#[inline]
3333#[target_feature(enable = "avx2")]
3334#[cfg_attr(test, assert_instr(vpsubsb))]
3335#[stable(feature = "simd_x86", since = "1.27.0")]
3336pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3337    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3338}
3339
3340/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
3341/// integers in `a` using saturation.
3342///
3343/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3344#[inline]
3345#[target_feature(enable = "avx2")]
3346#[cfg_attr(test, assert_instr(vpsubusw))]
3347#[stable(feature = "simd_x86", since = "1.27.0")]
3348pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3349    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3350}
3351
3352/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
3353/// integers in `a` using saturation.
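///
/// A minimal sketch, assuming AVX2 support is detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // unsigned saturation clamps at zero instead of wrapping around
/// let a = _mm256_set1_epi8(1);
/// let b = _mm256_set1_epi8(3);
/// let r = _mm256_subs_epu8(a, b);
/// let expected = _mm256_set1_epi8(0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```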
3354///
3355/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3356#[inline]
3357#[target_feature(enable = "avx2")]
3358#[cfg_attr(test, assert_instr(vpsubusb))]
3359#[stable(feature = "simd_x86", since = "1.27.0")]
3360pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3361    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3362}
3363
3364/// Unpacks and interleaves 8-bit integers from the high half of each
3365/// 128-bit lane of `a` and `b`.
3366///
3367/// ```rust
3368/// #[cfg(target_arch = "x86")]
3369/// use std::arch::x86::*;
3370/// #[cfg(target_arch = "x86_64")]
3371/// use std::arch::x86_64::*;
3372///
3373/// # fn main() {
3374/// #     if is_x86_feature_detected!("avx2") {
3375/// #         #[target_feature(enable = "avx2")]
3376/// #         unsafe fn worker() {
3377/// let a = _mm256_setr_epi8(
3378///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3379///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3380/// );
3381/// let b = _mm256_setr_epi8(
3382///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3383///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3384///     -30, -31,
3385/// );
3386///
3387/// let c = _mm256_unpackhi_epi8(a, b);
3388///
3389/// let expected = _mm256_setr_epi8(
3390///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3391///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3392///     -31,
3393/// );
3394/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3395///
3396/// #         }
3397/// #         unsafe { worker(); }
3398/// #     }
3399/// # }
3400/// ```
3401///
3402/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
3403#[inline]
3404#[target_feature(enable = "avx2")]
3405#[cfg_attr(test, assert_instr(vpunpckhbw))]
3406#[stable(feature = "simd_x86", since = "1.27.0")]
3407pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3408    unsafe {
3409        #[rustfmt::skip]
3410        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3411                8, 40, 9, 41, 10, 42, 11, 43,
3412                12, 44, 13, 45, 14, 46, 15, 47,
3413                24, 56, 25, 57, 26, 58, 27, 59,
3414                28, 60, 29, 61, 30, 62, 31, 63,
3415        ]);
3416        transmute(r)
3417    }
3418}
3419
3420/// Unpacks and interleaves 8-bit integers from the low half of each
3421/// 128-bit lane of `a` and `b`.
3422///
3423/// ```rust
3424/// #[cfg(target_arch = "x86")]
3425/// use std::arch::x86::*;
3426/// #[cfg(target_arch = "x86_64")]
3427/// use std::arch::x86_64::*;
3428///
3429/// # fn main() {
3430/// #     if is_x86_feature_detected!("avx2") {
3431/// #         #[target_feature(enable = "avx2")]
3432/// #         unsafe fn worker() {
3433/// let a = _mm256_setr_epi8(
3434///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3435///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3436/// );
3437/// let b = _mm256_setr_epi8(
3438///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3439///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3440///     -30, -31,
3441/// );
3442///
3443/// let c = _mm256_unpacklo_epi8(a, b);
3444///
3445/// let expected = _mm256_setr_epi8(
3446///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3447///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3448/// );
3449/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3450///
3451/// #         }
3452/// #         unsafe { worker(); }
3453/// #     }
3454/// # }
3455/// ```
3456///
3457/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
3458#[inline]
3459#[target_feature(enable = "avx2")]
3460#[cfg_attr(test, assert_instr(vpunpcklbw))]
3461#[stable(feature = "simd_x86", since = "1.27.0")]
3462pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3463    unsafe {
3464        #[rustfmt::skip]
3465        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3466            0, 32, 1, 33, 2, 34, 3, 35,
3467            4, 36, 5, 37, 6, 38, 7, 39,
3468            16, 48, 17, 49, 18, 50, 19, 51,
3469            20, 52, 21, 53, 22, 54, 23, 55,
3470        ]);
3471        transmute(r)
3472    }
3473}
3474
3475/// Unpacks and interleaves 16-bit integers from the high half of each
3476/// 128-bit lane of `a` and `b`.
3477///
3478/// ```rust
3479/// #[cfg(target_arch = "x86")]
3480/// use std::arch::x86::*;
3481/// #[cfg(target_arch = "x86_64")]
3482/// use std::arch::x86_64::*;
3483///
3484/// # fn main() {
3485/// #     if is_x86_feature_detected!("avx2") {
3486/// #         #[target_feature(enable = "avx2")]
3487/// #         unsafe fn worker() {
3488/// let a = _mm256_setr_epi16(
3489///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3490/// );
3491/// let b = _mm256_setr_epi16(
3492///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3493/// );
3494///
3495/// let c = _mm256_unpackhi_epi16(a, b);
3496///
3497/// let expected = _mm256_setr_epi16(
3498///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3499/// );
3500/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3501///
3502/// #         }
3503/// #         unsafe { worker(); }
3504/// #     }
3505/// # }
3506/// ```
3507///
3508/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
3509#[inline]
3510#[target_feature(enable = "avx2")]
3511#[cfg_attr(test, assert_instr(vpunpckhwd))]
3512#[stable(feature = "simd_x86", since = "1.27.0")]
3513pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3514    unsafe {
3515        let r: i16x16 = simd_shuffle!(
3516            a.as_i16x16(),
3517            b.as_i16x16(),
3518            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3519        );
3520        transmute(r)
3521    }
3522}
3523
3524/// Unpacks and interleaves 16-bit integers from the low half of each
3525/// 128-bit lane of `a` and `b`.
3526///
3527/// ```rust
3528/// #[cfg(target_arch = "x86")]
3529/// use std::arch::x86::*;
3530/// #[cfg(target_arch = "x86_64")]
3531/// use std::arch::x86_64::*;
3532///
3533/// # fn main() {
3534/// #     if is_x86_feature_detected!("avx2") {
3535/// #         #[target_feature(enable = "avx2")]
3536/// #         unsafe fn worker() {
3537///
3538/// let a = _mm256_setr_epi16(
3539///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3540/// );
3541/// let b = _mm256_setr_epi16(
3542///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3543/// );
3544///
3545/// let c = _mm256_unpacklo_epi16(a, b);
3546///
3547/// let expected = _mm256_setr_epi16(
3548///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3549/// );
3550/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3551///
3552/// #         }
3553/// #         unsafe { worker(); }
3554/// #     }
3555/// # }
3556/// ```
3557///
3558/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
3559#[inline]
3560#[target_feature(enable = "avx2")]
3561#[cfg_attr(test, assert_instr(vpunpcklwd))]
3562#[stable(feature = "simd_x86", since = "1.27.0")]
3563pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3564    unsafe {
3565        let r: i16x16 = simd_shuffle!(
3566            a.as_i16x16(),
3567            b.as_i16x16(),
3568            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3569        );
3570        transmute(r)
3571    }
3572}
3573
3574/// Unpacks and interleaves 32-bit integers from the high half of each
3575/// 128-bit lane of `a` and `b`.
3576///
3577/// ```rust
3578/// #[cfg(target_arch = "x86")]
3579/// use std::arch::x86::*;
3580/// #[cfg(target_arch = "x86_64")]
3581/// use std::arch::x86_64::*;
3582///
3583/// # fn main() {
3584/// #     if is_x86_feature_detected!("avx2") {
3585/// #         #[target_feature(enable = "avx2")]
3586/// #         unsafe fn worker() {
3587/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3588/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3589///
3590/// let c = _mm256_unpackhi_epi32(a, b);
3591///
3592/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3593/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3594///
3595/// #         }
3596/// #         unsafe { worker(); }
3597/// #     }
3598/// # }
3599/// ```
3600///
3601/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3602#[inline]
3603#[target_feature(enable = "avx2")]
3604#[cfg_attr(test, assert_instr(vunpckhps))]
3605#[stable(feature = "simd_x86", since = "1.27.0")]
3606pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3607    unsafe {
3608        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3609        transmute(r)
3610    }
3611}
3612
3613/// Unpacks and interleaves 32-bit integers from the low half of each
3614/// 128-bit lane of `a` and `b`.
3615///
3616/// ```rust
3617/// #[cfg(target_arch = "x86")]
3618/// use std::arch::x86::*;
3619/// #[cfg(target_arch = "x86_64")]
3620/// use std::arch::x86_64::*;
3621///
3622/// # fn main() {
3623/// #     if is_x86_feature_detected!("avx2") {
3624/// #         #[target_feature(enable = "avx2")]
3625/// #         unsafe fn worker() {
3626/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3627/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3628///
3629/// let c = _mm256_unpacklo_epi32(a, b);
3630///
3631/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3632/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3633///
3634/// #         }
3635/// #         unsafe { worker(); }
3636/// #     }
3637/// # }
3638/// ```
3639///
3640/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3641#[inline]
3642#[target_feature(enable = "avx2")]
3643#[cfg_attr(test, assert_instr(vunpcklps))]
3644#[stable(feature = "simd_x86", since = "1.27.0")]
3645pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3646    unsafe {
3647        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3648        transmute(r)
3649    }
3650}
3651
3652/// Unpacks and interleaves 64-bit integers from the high half of each
3653/// 128-bit lane of `a` and `b`.
3654///
3655/// ```rust
3656/// #[cfg(target_arch = "x86")]
3657/// use std::arch::x86::*;
3658/// #[cfg(target_arch = "x86_64")]
3659/// use std::arch::x86_64::*;
3660///
3661/// # fn main() {
3662/// #     if is_x86_feature_detected!("avx2") {
3663/// #         #[target_feature(enable = "avx2")]
3664/// #         unsafe fn worker() {
3665/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3666/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3667///
3668/// let c = _mm256_unpackhi_epi64(a, b);
3669///
3670/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3671/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3672///
3673/// #         }
3674/// #         unsafe { worker(); }
3675/// #     }
3676/// # }
3677/// ```
3678///
3679/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3680#[inline]
3681#[target_feature(enable = "avx2")]
3682#[cfg_attr(test, assert_instr(vunpckhpd))]
3683#[stable(feature = "simd_x86", since = "1.27.0")]
3684pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3685    unsafe {
3686        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3687        transmute(r)
3688    }
3689}
3690
3691/// Unpacks and interleaves 64-bit integers from the low half of each
3692/// 128-bit lane of `a` and `b`.
3693///
3694/// ```rust
3695/// #[cfg(target_arch = "x86")]
3696/// use std::arch::x86::*;
3697/// #[cfg(target_arch = "x86_64")]
3698/// use std::arch::x86_64::*;
3699///
3700/// # fn main() {
3701/// #     if is_x86_feature_detected!("avx2") {
3702/// #         #[target_feature(enable = "avx2")]
3703/// #         unsafe fn worker() {
3704/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3705/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3706///
3707/// let c = _mm256_unpacklo_epi64(a, b);
3708///
3709/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3710/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3711///
3712/// #         }
3713/// #         unsafe { worker(); }
3714/// #     }
3715/// # }
3716/// ```
3717///
3718/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3719#[inline]
3720#[target_feature(enable = "avx2")]
3721#[cfg_attr(test, assert_instr(vunpcklpd))]
3722#[stable(feature = "simd_x86", since = "1.27.0")]
3723pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3724    unsafe {
3725        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3726        transmute(r)
3727    }
3728}
3729
3730/// Computes the bitwise XOR of 256 bits (representing integer data)
3731/// in `a` and `b`.
3732///
3733/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3734#[inline]
3735#[target_feature(enable = "avx2")]
3736#[cfg_attr(test, assert_instr(vxorps))]
3737#[stable(feature = "simd_x86", since = "1.27.0")]
3738pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3739    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3740}
3741
3742/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3743/// integer containing the zero-extended integer data.
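///
/// A minimal sketch, assuming AVX2 support is detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// // the extracted byte is zero-extended: `-1` comes back as `255`
/// let a = _mm256_set1_epi8(-1);
/// assert_eq!(_mm256_extract_epi8(a, 0), 255);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```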
3744///
3745/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3746///
3747/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3748#[inline]
3749#[target_feature(enable = "avx2")]
3750// This intrinsic has no corresponding instruction.
3751#[rustc_legacy_const_generics(1)]
3752#[stable(feature = "simd_x86", since = "1.27.0")]
3753pub fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3754    static_assert_uimm_bits!(INDEX, 5);
3755    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
3756}
3757
3758/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3759/// integer containing the zero-extended integer data.
3760///
3761/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3762///
3763/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3764#[inline]
3765#[target_feature(enable = "avx2")]
3766// This intrinsic has no corresponding instruction.
3767#[rustc_legacy_const_generics(1)]
3768#[stable(feature = "simd_x86", since = "1.27.0")]
3769pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3770    static_assert_uimm_bits!(INDEX, 4);
3771    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
3772}
3773
3774#[allow(improper_ctypes)]
3775unsafe extern "C" {
3776    #[link_name = "llvm.x86.avx2.phadd.w"]
3777    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3778    #[link_name = "llvm.x86.avx2.phadd.d"]
3779    fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3780    #[link_name = "llvm.x86.avx2.phadd.sw"]
3781    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3782    #[link_name = "llvm.x86.avx2.phsub.w"]
3783    fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3784    #[link_name = "llvm.x86.avx2.phsub.d"]
3785    fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3786    #[link_name = "llvm.x86.avx2.phsub.sw"]
3787    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3788    #[link_name = "llvm.x86.avx2.pmadd.wd"]
3789    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3790    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3791    fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3792    #[link_name = "llvm.x86.avx2.maskload.d"]
3793    fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3794    #[link_name = "llvm.x86.avx2.maskload.d.256"]
3795    fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3796    #[link_name = "llvm.x86.avx2.maskload.q"]
3797    fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3798    #[link_name = "llvm.x86.avx2.maskload.q.256"]
3799    fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3800    #[link_name = "llvm.x86.avx2.maskstore.d"]
3801    fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3802    #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3803    fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3804    #[link_name = "llvm.x86.avx2.maskstore.q"]
3805    fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3806    #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3807    fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3808    #[link_name = "llvm.x86.avx2.mpsadbw"]
3809    fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
3810    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3811    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3812    #[link_name = "llvm.x86.avx2.packsswb"]
3813    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3814    #[link_name = "llvm.x86.avx2.packssdw"]
3815    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3816    #[link_name = "llvm.x86.avx2.packuswb"]
3817    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3818    #[link_name = "llvm.x86.avx2.packusdw"]
3819    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3820    #[link_name = "llvm.x86.avx2.psad.bw"]
3821    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3822    #[link_name = "llvm.x86.avx2.psign.b"]
3823    fn psignb(a: i8x32, b: i8x32) -> i8x32;
3824    #[link_name = "llvm.x86.avx2.psign.w"]
3825    fn psignw(a: i16x16, b: i16x16) -> i16x16;
3826    #[link_name = "llvm.x86.avx2.psign.d"]
3827    fn psignd(a: i32x8, b: i32x8) -> i32x8;
3828    #[link_name = "llvm.x86.avx2.psll.w"]
3829    fn psllw(a: i16x16, count: i16x8) -> i16x16;
3830    #[link_name = "llvm.x86.avx2.psll.d"]
3831    fn pslld(a: i32x8, count: i32x4) -> i32x8;
3832    #[link_name = "llvm.x86.avx2.psll.q"]
3833    fn psllq(a: i64x4, count: i64x2) -> i64x4;
3834    #[link_name = "llvm.x86.avx2.psllv.d"]
3835    fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3836    #[link_name = "llvm.x86.avx2.psllv.d.256"]
3837    fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3838    #[link_name = "llvm.x86.avx2.psllv.q"]
3839    fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3840    #[link_name = "llvm.x86.avx2.psllv.q.256"]
3841    fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3842    #[link_name = "llvm.x86.avx2.psra.w"]
3843    fn psraw(a: i16x16, count: i16x8) -> i16x16;
3844    #[link_name = "llvm.x86.avx2.psra.d"]
3845    fn psrad(a: i32x8, count: i32x4) -> i32x8;
3846    #[link_name = "llvm.x86.avx2.psrav.d"]
3847    fn psravd(a: i32x4, count: i32x4) -> i32x4;
3848    #[link_name = "llvm.x86.avx2.psrav.d.256"]
3849    fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3850    #[link_name = "llvm.x86.avx2.psrl.w"]
3851    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3852    #[link_name = "llvm.x86.avx2.psrl.d"]
3853    fn psrld(a: i32x8, count: i32x4) -> i32x8;
3854    #[link_name = "llvm.x86.avx2.psrl.q"]
3855    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3856    #[link_name = "llvm.x86.avx2.psrlv.d"]
3857    fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3858    #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3859    fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3860    #[link_name = "llvm.x86.avx2.psrlv.q"]
3861    fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3862    #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3863    fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3864    #[link_name = "llvm.x86.avx2.pshuf.b"]
3865    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3866    #[link_name = "llvm.x86.avx2.permd"]
3867    fn permd(a: u32x8, b: u32x8) -> u32x8;
3868    #[link_name = "llvm.x86.avx2.permps"]
3869    fn permps(a: __m256, b: i32x8) -> __m256;
3870    #[link_name = "llvm.x86.avx2.vperm2i128"]
3871    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3872    #[link_name = "llvm.x86.avx2.gather.d.d"]
3873    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3874    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3875    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3876    #[link_name = "llvm.x86.avx2.gather.d.q"]
3877    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3878    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3879    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3880    #[link_name = "llvm.x86.avx2.gather.q.d"]
3881    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3882    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3883    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3884    #[link_name = "llvm.x86.avx2.gather.q.q"]
3885    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3886    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3887    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3888    #[link_name = "llvm.x86.avx2.gather.d.pd"]
3889    fn pgatherdpd(
3890        src: __m128d,
3891        slice: *const i8,
3892        offsets: i32x4,
3893        mask: __m128d,
3894        scale: i8,
3895    ) -> __m128d;
3896    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3897    fn vpgatherdpd(
3898        src: __m256d,
3899        slice: *const i8,
3900        offsets: i32x4,
3901        mask: __m256d,
3902        scale: i8,
3903    ) -> __m256d;
3904    #[link_name = "llvm.x86.avx2.gather.q.pd"]
3905    fn pgatherqpd(
3906        src: __m128d,
3907        slice: *const i8,
3908        offsets: i64x2,
3909        mask: __m128d,
3910        scale: i8,
3911    ) -> __m128d;
3912    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
3913    fn vpgatherqpd(
3914        src: __m256d,
3915        slice: *const i8,
3916        offsets: i64x4,
3917        mask: __m256d,
3918        scale: i8,
3919    ) -> __m256d;
3920    #[link_name = "llvm.x86.avx2.gather.d.ps"]
3921    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
3922    -> __m128;
3923    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
3924    fn vpgatherdps(
3925        src: __m256,
3926        slice: *const i8,
3927        offsets: i32x8,
3928        mask: __m256,
3929        scale: i8,
3930    ) -> __m256;
3931    #[link_name = "llvm.x86.avx2.gather.q.ps"]
3932    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
3933    -> __m128;
3934    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
3935    fn vpgatherqps(
3936        src: __m128,
3937        slice: *const i8,
3938        offsets: i64x4,
3939        mask: __m128,
3940        scale: i8,
3941    ) -> __m128;
3942}
3943
3944#[cfg(test)]
3945mod tests {
3946
3947    use stdarch_test::simd_test;
3948
3949    use crate::core_arch::x86::*;
3950
3951    #[simd_test(enable = "avx2")]
3952    unsafe fn test_mm256_abs_epi32() {
3953        #[rustfmt::skip]
3954        let a = _mm256_setr_epi32(
3955            0, 1, -1, i32::MAX,
3956            i32::MIN, 100, -100, -32,
3957        );
3958        let r = _mm256_abs_epi32(a);
3959        #[rustfmt::skip]
3960        let e = _mm256_setr_epi32(
3961            0, 1, 1, i32::MAX,
3962            i32::MAX.wrapping_add(1), 100, 100, 32,
3963        );
3964        assert_eq_m256i(r, e);
3965    }
3966
3967    #[simd_test(enable = "avx2")]
3968    unsafe fn test_mm256_abs_epi16() {
3969        #[rustfmt::skip]
3970        let a = _mm256_setr_epi16(
3971            0,  1, -1, 2, -2, 3, -3, 4,
3972            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
3973        );
3974        let r = _mm256_abs_epi16(a);
3975        #[rustfmt::skip]
3976        let e = _mm256_setr_epi16(
3977            0, 1, 1, 2, 2, 3, 3, 4,
3978            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
3979        );
3980        assert_eq_m256i(r, e);
3981    }
3982
3983    #[simd_test(enable = "avx2")]
3984    unsafe fn test_mm256_abs_epi8() {
3985        #[rustfmt::skip]
3986        let a = _mm256_setr_epi8(
3987            0, 1, -1, 2, -2, 3, -3, 4,
3988            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3989            0, 1, -1, 2, -2, 3, -3, 4,
3990            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3991        );
3992        let r = _mm256_abs_epi8(a);
3993        #[rustfmt::skip]
3994        let e = _mm256_setr_epi8(
3995            0, 1, 1, 2, 2, 3, 3, 4,
3996            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3997            0, 1, 1, 2, 2, 3, 3, 4,
3998            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3999        );
4000        assert_eq_m256i(r, e);
4001    }
4002
4003    #[simd_test(enable = "avx2")]
4004    unsafe fn test_mm256_add_epi64() {
4005        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4006        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4007        let r = _mm256_add_epi64(a, b);
4008        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4009        assert_eq_m256i(r, e);
4010    }
4011
4012    #[simd_test(enable = "avx2")]
4013    unsafe fn test_mm256_add_epi32() {
4014        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4015        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4016        let r = _mm256_add_epi32(a, b);
4017        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4018        assert_eq_m256i(r, e);
4019    }
4020
4021    #[simd_test(enable = "avx2")]
4022    unsafe fn test_mm256_add_epi16() {
4023        #[rustfmt::skip]
4024        let a = _mm256_setr_epi16(
4025            0, 1, 2, 3, 4, 5, 6, 7,
4026            8, 9, 10, 11, 12, 13, 14, 15,
4027        );
4028        #[rustfmt::skip]
4029        let b = _mm256_setr_epi16(
4030            0, 1, 2, 3, 4, 5, 6, 7,
4031            8, 9, 10, 11, 12, 13, 14, 15,
4032        );
4033        let r = _mm256_add_epi16(a, b);
4034        #[rustfmt::skip]
4035        let e = _mm256_setr_epi16(
4036            0, 2, 4, 6, 8, 10, 12, 14,
4037            16, 18, 20, 22, 24, 26, 28, 30,
4038        );
4039        assert_eq_m256i(r, e);
4040    }
4041
4042    #[simd_test(enable = "avx2")]
4043    unsafe fn test_mm256_add_epi8() {
4044        #[rustfmt::skip]
4045        let a = _mm256_setr_epi8(
4046            0, 1, 2, 3, 4, 5, 6, 7,
4047            8, 9, 10, 11, 12, 13, 14, 15,
4048            16, 17, 18, 19, 20, 21, 22, 23,
4049            24, 25, 26, 27, 28, 29, 30, 31,
4050        );
4051        #[rustfmt::skip]
4052        let b = _mm256_setr_epi8(
4053            0, 1, 2, 3, 4, 5, 6, 7,
4054            8, 9, 10, 11, 12, 13, 14, 15,
4055            16, 17, 18, 19, 20, 21, 22, 23,
4056            24, 25, 26, 27, 28, 29, 30, 31,
4057        );
4058        let r = _mm256_add_epi8(a, b);
4059        #[rustfmt::skip]
4060        let e = _mm256_setr_epi8(
4061            0, 2, 4, 6, 8, 10, 12, 14,
4062            16, 18, 20, 22, 24, 26, 28, 30,
4063            32, 34, 36, 38, 40, 42, 44, 46,
4064            48, 50, 52, 54, 56, 58, 60, 62,
4065        );
4066        assert_eq_m256i(r, e);
4067    }
4068
4069    #[simd_test(enable = "avx2")]
4070    unsafe fn test_mm256_adds_epi8() {
4071        #[rustfmt::skip]
4072        let a = _mm256_setr_epi8(
4073            0, 1, 2, 3, 4, 5, 6, 7,
4074            8, 9, 10, 11, 12, 13, 14, 15,
4075            16, 17, 18, 19, 20, 21, 22, 23,
4076            24, 25, 26, 27, 28, 29, 30, 31,
4077        );
4078        #[rustfmt::skip]
4079        let b = _mm256_setr_epi8(
4080            32, 33, 34, 35, 36, 37, 38, 39,
4081            40, 41, 42, 43, 44, 45, 46, 47,
4082            48, 49, 50, 51, 52, 53, 54, 55,
4083            56, 57, 58, 59, 60, 61, 62, 63,
4084        );
4085        let r = _mm256_adds_epi8(a, b);
4086        #[rustfmt::skip]
4087        let e = _mm256_setr_epi8(
4088            32, 34, 36, 38, 40, 42, 44, 46,
4089            48, 50, 52, 54, 56, 58, 60, 62,
4090            64, 66, 68, 70, 72, 74, 76, 78,
4091            80, 82, 84, 86, 88, 90, 92, 94,
4092        );
4093        assert_eq_m256i(r, e);
4094    }
4095
4096    #[simd_test(enable = "avx2")]
4097    unsafe fn test_mm256_adds_epi8_saturate_positive() {
4098        let a = _mm256_set1_epi8(0x7F);
4099        let b = _mm256_set1_epi8(1);
4100        let r = _mm256_adds_epi8(a, b);
4101        assert_eq_m256i(r, a);
4102    }
4103
4104    #[simd_test(enable = "avx2")]
4105    unsafe fn test_mm256_adds_epi8_saturate_negative() {
4106        let a = _mm256_set1_epi8(-0x80);
4107        let b = _mm256_set1_epi8(-1);
4108        let r = _mm256_adds_epi8(a, b);
4109        assert_eq_m256i(r, a);
4110    }
4111
4112    #[simd_test(enable = "avx2")]
4113    unsafe fn test_mm256_adds_epi16() {
4114        #[rustfmt::skip]
4115        let a = _mm256_setr_epi16(
4116            0, 1, 2, 3, 4, 5, 6, 7,
4117            8, 9, 10, 11, 12, 13, 14, 15,
4118        );
4119        #[rustfmt::skip]
4120        let b = _mm256_setr_epi16(
4121            32, 33, 34, 35, 36, 37, 38, 39,
4122            40, 41, 42, 43, 44, 45, 46, 47,
4123        );
4124        let r = _mm256_adds_epi16(a, b);
4125        #[rustfmt::skip]
4126        let e = _mm256_setr_epi16(
4127            32, 34, 36, 38, 40, 42, 44, 46,
4128            48, 50, 52, 54, 56, 58, 60, 62,
4129        );
4130
4131        assert_eq_m256i(r, e);
4132    }
4133
4134    #[simd_test(enable = "avx2")]
4135    unsafe fn test_mm256_adds_epi16_saturate_positive() {
4136        let a = _mm256_set1_epi16(0x7FFF);
4137        let b = _mm256_set1_epi16(1);
4138        let r = _mm256_adds_epi16(a, b);
4139        assert_eq_m256i(r, a);
4140    }
4141
4142    #[simd_test(enable = "avx2")]
4143    unsafe fn test_mm256_adds_epi16_saturate_negative() {
4144        let a = _mm256_set1_epi16(-0x8000);
4145        let b = _mm256_set1_epi16(-1);
4146        let r = _mm256_adds_epi16(a, b);
4147        assert_eq_m256i(r, a);
4148    }
4149
4150    #[simd_test(enable = "avx2")]
4151    unsafe fn test_mm256_adds_epu8() {
4152        #[rustfmt::skip]
4153        let a = _mm256_setr_epi8(
4154            0, 1, 2, 3, 4, 5, 6, 7,
4155            8, 9, 10, 11, 12, 13, 14, 15,
4156            16, 17, 18, 19, 20, 21, 22, 23,
4157            24, 25, 26, 27, 28, 29, 30, 31,
4158        );
4159        #[rustfmt::skip]
4160        let b = _mm256_setr_epi8(
4161            32, 33, 34, 35, 36, 37, 38, 39,
4162            40, 41, 42, 43, 44, 45, 46, 47,
4163            48, 49, 50, 51, 52, 53, 54, 55,
4164            56, 57, 58, 59, 60, 61, 62, 63,
4165        );
4166        let r = _mm256_adds_epu8(a, b);
4167        #[rustfmt::skip]
4168        let e = _mm256_setr_epi8(
4169            32, 34, 36, 38, 40, 42, 44, 46,
4170            48, 50, 52, 54, 56, 58, 60, 62,
4171            64, 66, 68, 70, 72, 74, 76, 78,
4172            80, 82, 84, 86, 88, 90, 92, 94,
4173        );
4174        assert_eq_m256i(r, e);
4175    }
4176
4177    #[simd_test(enable = "avx2")]
4178    unsafe fn test_mm256_adds_epu8_saturate() {
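        // !0 has the byte pattern 0xFF (u8::MAX), so the unsigned saturating add leaves `a` unchanged.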
4179        let a = _mm256_set1_epi8(!0);
4180        let b = _mm256_set1_epi8(1);
4181        let r = _mm256_adds_epu8(a, b);
4182        assert_eq_m256i(r, a);
4183    }
4184
4185    #[simd_test(enable = "avx2")]
4186    unsafe fn test_mm256_adds_epu16() {
4187        #[rustfmt::skip]
4188        let a = _mm256_setr_epi16(
4189            0, 1, 2, 3, 4, 5, 6, 7,
4190            8, 9, 10, 11, 12, 13, 14, 15,
4191        );
4192        #[rustfmt::skip]
4193        let b = _mm256_setr_epi16(
4194            32, 33, 34, 35, 36, 37, 38, 39,
4195            40, 41, 42, 43, 44, 45, 46, 47,
4196        );
4197        let r = _mm256_adds_epu16(a, b);
4198        #[rustfmt::skip]
4199        let e = _mm256_setr_epi16(
4200            32, 34, 36, 38, 40, 42, 44, 46,
4201            48, 50, 52, 54, 56, 58, 60, 62,
4202        );
4203
4204        assert_eq_m256i(r, e);
4205    }
4206
4207    #[simd_test(enable = "avx2")]
4208    unsafe fn test_mm256_adds_epu16_saturate() {
4209        let a = _mm256_set1_epi16(!0);
4210        let b = _mm256_set1_epi16(1);
4211        let r = _mm256_adds_epu16(a, b);
4212        assert_eq_m256i(r, a);
4213    }
4214
4215    #[simd_test(enable = "avx2")]
4216    unsafe fn test_mm256_and_si256() {
4217        let a = _mm256_set1_epi8(5);
4218        let b = _mm256_set1_epi8(3);
4219        let got = _mm256_and_si256(a, b);
4220        assert_eq_m256i(got, _mm256_set1_epi8(1));
4221    }
4222
4223    #[simd_test(enable = "avx2")]
4224    unsafe fn test_mm256_andnot_si256() {
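        // vpandn computes (!a) & b; here !5 & 3 == 2 in every byte.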
4225        let a = _mm256_set1_epi8(5);
4226        let b = _mm256_set1_epi8(3);
4227        let got = _mm256_andnot_si256(a, b);
4228        assert_eq_m256i(got, _mm256_set1_epi8(2));
4229    }
4230
4231    #[simd_test(enable = "avx2")]
4232    unsafe fn test_mm256_avg_epu8() {
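        // The unsigned average rounds up: (3 + 9 + 1) >> 1 == 6.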
4233        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4234        let r = _mm256_avg_epu8(a, b);
4235        assert_eq_m256i(r, _mm256_set1_epi8(6));
4236    }
4237
4238    #[simd_test(enable = "avx2")]
4239    unsafe fn test_mm256_avg_epu16() {
4240        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4241        let r = _mm256_avg_epu16(a, b);
4242        assert_eq_m256i(r, _mm256_set1_epi16(6));
4243    }
4244
4245    #[simd_test(enable = "avx2")]
4246    unsafe fn test_mm_blend_epi32() {
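        // Bit i of the immediate picks 32-bit element i from `b` when set, from `a` when clear.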
4247        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4248        let e = _mm_setr_epi32(9, 3, 3, 3);
4249        let r = _mm_blend_epi32::<0x01>(a, b);
4250        assert_eq_m128i(r, e);
4251
4252        let r = _mm_blend_epi32::<0x0E>(b, a);
4253        assert_eq_m128i(r, e);
4254    }
4255
4256    #[simd_test(enable = "avx2")]
4257    unsafe fn test_mm256_blend_epi32() {
4258        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4259        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4260        let r = _mm256_blend_epi32::<0x01>(a, b);
4261        assert_eq_m256i(r, e);
4262
4263        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4264        let r = _mm256_blend_epi32::<0x82>(a, b);
4265        assert_eq_m256i(r, e);
4266
4267        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4268        let r = _mm256_blend_epi32::<0x7C>(a, b);
4269        assert_eq_m256i(r, e);
4270    }
4271
4272    #[simd_test(enable = "avx2")]
4273    unsafe fn test_mm256_blend_epi16() {
4274        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4275        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4276        let r = _mm256_blend_epi16::<0x01>(a, b);
4277        assert_eq_m256i(r, e);
4278
4279        let r = _mm256_blend_epi16::<0xFE>(b, a);
4280        assert_eq_m256i(r, e);
4281    }
4282
4283    #[simd_test(enable = "avx2")]
4284    unsafe fn test_mm256_blendv_epi8() {
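        // Only byte 2 of the mask has its sign bit set, so only that byte is taken from `b`.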
4285        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4286        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
4287        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
4288        let r = _mm256_blendv_epi8(a, b, mask);
4289        assert_eq_m256i(r, e);
4290    }
4291
4292    #[simd_test(enable = "avx2")]
4293    unsafe fn test_mm_broadcastb_epi8() {
4294        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4295        let res = _mm_broadcastb_epi8(a);
4296        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4297    }
4298
4299    #[simd_test(enable = "avx2")]
4300    unsafe fn test_mm256_broadcastb_epi8() {
4301        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4302        let res = _mm256_broadcastb_epi8(a);
4303        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4304    }
4305
4306    #[simd_test(enable = "avx2")]
4307    unsafe fn test_mm_broadcastd_epi32() {
4308        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4309        let res = _mm_broadcastd_epi32(a);
4310        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4311    }
4312
4313    #[simd_test(enable = "avx2")]
4314    unsafe fn test_mm256_broadcastd_epi32() {
4315        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4316        let res = _mm256_broadcastd_epi32(a);
4317        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4318    }
4319
4320    #[simd_test(enable = "avx2")]
4321    unsafe fn test_mm_broadcastq_epi64() {
4322        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4323        let res = _mm_broadcastq_epi64(a);
4324        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4325    }
4326
4327    #[simd_test(enable = "avx2")]
4328    unsafe fn test_mm256_broadcastq_epi64() {
4329        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4330        let res = _mm256_broadcastq_epi64(a);
4331        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4332    }
4333
4334    #[simd_test(enable = "avx2")]
4335    unsafe fn test_mm_broadcastsd_pd() {
4336        let a = _mm_setr_pd(6.88, 3.44);
4337        let res = _mm_broadcastsd_pd(a);
4338        assert_eq_m128d(res, _mm_set1_pd(6.88));
4339    }
4340
4341    #[simd_test(enable = "avx2")]
4342    unsafe fn test_mm256_broadcastsd_pd() {
4343        let a = _mm_setr_pd(6.88, 3.44);
4344        let res = _mm256_broadcastsd_pd(a);
4345        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
4346    }
4347
4348    #[simd_test(enable = "avx2")]
4349    unsafe fn test_mm_broadcastsi128_si256() {
4350        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4351        let res = _mm_broadcastsi128_si256(a);
4352        let retval = _mm256_setr_epi64x(
4353            0x0987654321012334,
4354            0x5678909876543210,
4355            0x0987654321012334,
4356            0x5678909876543210,
4357        );
4358        assert_eq_m256i(res, retval);
4359    }
4360
4361    #[simd_test(enable = "avx2")]
4362    unsafe fn test_mm256_broadcastsi128_si256() {
4363        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4364        let res = _mm256_broadcastsi128_si256(a);
4365        let retval = _mm256_setr_epi64x(
4366            0x0987654321012334,
4367            0x5678909876543210,
4368            0x0987654321012334,
4369            0x5678909876543210,
4370        );
4371        assert_eq_m256i(res, retval);
4372    }
4373
4374    #[simd_test(enable = "avx2")]
4375    unsafe fn test_mm_broadcastss_ps() {
4376        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4377        let res = _mm_broadcastss_ps(a);
4378        assert_eq_m128(res, _mm_set1_ps(6.88));
4379    }
4380
4381    #[simd_test(enable = "avx2")]
4382    unsafe fn test_mm256_broadcastss_ps() {
4383        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4384        let res = _mm256_broadcastss_ps(a);
4385        assert_eq_m256(res, _mm256_set1_ps(6.88));
4386    }
4387
4388    #[simd_test(enable = "avx2")]
4389    unsafe fn test_mm_broadcastw_epi16() {
4390        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4391        let res = _mm_broadcastw_epi16(a);
4392        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4393    }
4394
4395    #[simd_test(enable = "avx2")]
4396    unsafe fn test_mm256_broadcastw_epi16() {
4397        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4398        let res = _mm256_broadcastw_epi16(a);
4399        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4400    }
4401
4402    #[simd_test(enable = "avx2")]
4403    unsafe fn test_mm256_cmpeq_epi8() {
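        // Only index 2 holds equal bytes (both 2), so only that byte of the result is !0.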
4404        #[rustfmt::skip]
4405        let a = _mm256_setr_epi8(
4406            0, 1, 2, 3, 4, 5, 6, 7,
4407            8, 9, 10, 11, 12, 13, 14, 15,
4408            16, 17, 18, 19, 20, 21, 22, 23,
4409            24, 25, 26, 27, 28, 29, 30, 31,
4410        );
4411        #[rustfmt::skip]
4412        let b = _mm256_setr_epi8(
4413            31, 30, 2, 28, 27, 26, 25, 24,
4414            23, 22, 21, 20, 19, 18, 17, 16,
4415            15, 14, 13, 12, 11, 10, 9, 8,
4416            7, 6, 5, 4, 3, 2, 1, 0,
4417        );
4418        let r = _mm256_cmpeq_epi8(a, b);
4419        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
4420    }
4421
4422    #[simd_test(enable = "avx2")]
4423    unsafe fn test_mm256_cmpeq_epi16() {
4424        #[rustfmt::skip]
4425        let a = _mm256_setr_epi16(
4426            0, 1, 2, 3, 4, 5, 6, 7,
4427            8, 9, 10, 11, 12, 13, 14, 15,
4428        );
4429        #[rustfmt::skip]
4430        let b = _mm256_setr_epi16(
4431            15, 14, 2, 12, 11, 10, 9, 8,
4432            7, 6, 5, 4, 3, 2, 1, 0,
4433        );
4434        let r = _mm256_cmpeq_epi16(a, b);
4435        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
4436    }
4437
4438    #[simd_test(enable = "avx2")]
4439    unsafe fn test_mm256_cmpeq_epi32() {
4440        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4441        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4442        let r = _mm256_cmpeq_epi32(a, b);
4443        let e = _mm256_set1_epi32(0);
4444        let e = _mm256_insert_epi32::<2>(e, !0);
4445        assert_eq_m256i(r, e);
4446    }
4447
4448    #[simd_test(enable = "avx2")]
4449    unsafe fn test_mm256_cmpeq_epi64() {
4450        let a = _mm256_setr_epi64x(0, 1, 2, 3);
4451        let b = _mm256_setr_epi64x(3, 2, 2, 0);
4452        let r = _mm256_cmpeq_epi64(a, b);
4453        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
4454    }
4455
4456    #[simd_test(enable = "avx2")]
4457    unsafe fn test_mm256_cmpgt_epi8() {
4458        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
4459        let b = _mm256_set1_epi8(0);
4460        let r = _mm256_cmpgt_epi8(a, b);
4461        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
4462    }
4463
4464    #[simd_test(enable = "avx2")]
4465    unsafe fn test_mm256_cmpgt_epi16() {
4466        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
4467        let b = _mm256_set1_epi16(0);
4468        let r = _mm256_cmpgt_epi16(a, b);
4469        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
4470    }
4471
4472    #[simd_test(enable = "avx2")]
4473    unsafe fn test_mm256_cmpgt_epi32() {
4474        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
4475        let b = _mm256_set1_epi32(0);
4476        let r = _mm256_cmpgt_epi32(a, b);
4477        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
4478    }
4479
4480    #[simd_test(enable = "avx2")]
4481    unsafe fn test_mm256_cmpgt_epi64() {
4482        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
4483        let b = _mm256_set1_epi64x(0);
4484        let r = _mm256_cmpgt_epi64(a, b);
4485        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
4486    }
4487
4488    #[simd_test(enable = "avx2")]
4489    unsafe fn test_mm256_cvtepi8_epi16() {
4490        #[rustfmt::skip]
4491        let a = _mm_setr_epi8(
4492            0, 0, -1, 1, -2, 2, -3, 3,
4493            -4, 4, -5, 5, -6, 6, -7, 7,
4494        );
4495        #[rustfmt::skip]
4496        let r = _mm256_setr_epi16(
4497            0, 0, -1, 1, -2, 2, -3, 3,
4498            -4, 4, -5, 5, -6, 6, -7, 7,
4499        );
4500        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4501    }
4502
4503    #[simd_test(enable = "avx2")]
4504    unsafe fn test_mm256_cvtepi8_epi32() {
4505        #[rustfmt::skip]
4506        let a = _mm_setr_epi8(
4507            0, 0, -1, 1, -2, 2, -3, 3,
4508            -4, 4, -5, 5, -6, 6, -7, 7,
4509        );
4510        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4511        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4512    }
4513
4514    #[simd_test(enable = "avx2")]
4515    unsafe fn test_mm256_cvtepi8_epi64() {
4516        #[rustfmt::skip]
4517        let a = _mm_setr_epi8(
4518            0, 0, -1, 1, -2, 2, -3, 3,
4519            -4, 4, -5, 5, -6, 6, -7, 7,
4520        );
4521        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4522        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4523    }
4524
4525    #[simd_test(enable = "avx2")]
4526    unsafe fn test_mm256_cvtepi16_epi32() {
4527        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4528        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4529        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4530    }
4531
4532    #[simd_test(enable = "avx2")]
4533    unsafe fn test_mm256_cvtepi16_epi64() {
4534        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4535        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4536        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4537    }
4538
4539    #[simd_test(enable = "avx2")]
4540    unsafe fn test_mm256_cvtepi32_epi64() {
4541        let a = _mm_setr_epi32(0, 0, -1, 1);
4542        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4543        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4544    }
4545
4546    #[simd_test(enable = "avx2")]
4547    unsafe fn test_mm256_cvtepu16_epi32() {
4548        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4549        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4550        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4551    }
4552
4553    #[simd_test(enable = "avx2")]
4554    unsafe fn test_mm256_cvtepu16_epi64() {
4555        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4556        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4557        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4558    }
4559
4560    #[simd_test(enable = "avx2")]
4561    unsafe fn test_mm256_cvtepu32_epi64() {
4562        let a = _mm_setr_epi32(0, 1, 2, 3);
4563        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4564        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4565    }
4566
4567    #[simd_test(enable = "avx2")]
4568    unsafe fn test_mm256_cvtepu8_epi16() {
4569        #[rustfmt::skip]
4570        let a = _mm_setr_epi8(
4571            0, 1, 2, 3, 4, 5, 6, 7,
4572            8, 9, 10, 11, 12, 13, 14, 15,
4573        );
4574        #[rustfmt::skip]
4575        let r = _mm256_setr_epi16(
4576            0, 1, 2, 3, 4, 5, 6, 7,
4577            8, 9, 10, 11, 12, 13, 14, 15,
4578        );
4579        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4580    }
4581
4582    #[simd_test(enable = "avx2")]
4583    unsafe fn test_mm256_cvtepu8_epi32() {
4584        #[rustfmt::skip]
4585        let a = _mm_setr_epi8(
4586            0, 1, 2, 3, 4, 5, 6, 7,
4587            8, 9, 10, 11, 12, 13, 14, 15,
4588        );
4589        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4590        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4591    }
4592
4593    #[simd_test(enable = "avx2")]
4594    unsafe fn test_mm256_cvtepu8_epi64() {
4595        #[rustfmt::skip]
4596        let a = _mm_setr_epi8(
4597            0, 1, 2, 3, 4, 5, 6, 7,
4598            8, 9, 10, 11, 12, 13, 14, 15,
4599        );
4600        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4601        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4602    }
4603
4604    #[simd_test(enable = "avx2")]
4605    unsafe fn test_mm256_extracti128_si256() {
4606        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4607        let r = _mm256_extracti128_si256::<1>(a);
4608        let e = _mm_setr_epi64x(3, 4);
4609        assert_eq_m128i(r, e);
4610    }
4611
4612    #[simd_test(enable = "avx2")]
4613    unsafe fn test_mm256_hadd_epi16() {
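        // In each 128-bit lane, the low four outputs are adjacent-pair sums from `a` (2 + 2)
        // and the high four are adjacent-pair sums from `b` (4 + 4).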
4614        let a = _mm256_set1_epi16(2);
4615        let b = _mm256_set1_epi16(4);
4616        let r = _mm256_hadd_epi16(a, b);
4617        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4618        assert_eq_m256i(r, e);
4619    }
4620
4621    #[simd_test(enable = "avx2")]
4622    unsafe fn test_mm256_hadd_epi32() {
4623        let a = _mm256_set1_epi32(2);
4624        let b = _mm256_set1_epi32(4);
4625        let r = _mm256_hadd_epi32(a, b);
4626        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4627        assert_eq_m256i(r, e);
4628    }
4629
4630    #[simd_test(enable = "avx2")]
4631    unsafe fn test_mm256_hadds_epi16() {
4632        let a = _mm256_set1_epi16(2);
4633        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4634        let a = _mm256_insert_epi16::<1>(a, 1);
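        // The first pair sum, 0x7fff + 1, saturates to 0x7FFF instead of wrapping.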
4635        let b = _mm256_set1_epi16(4);
4636        let r = _mm256_hadds_epi16(a, b);
4637        #[rustfmt::skip]
4638        let e = _mm256_setr_epi16(
4639            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4640            4, 4, 4, 4, 8, 8, 8, 8,
4641        );
4642        assert_eq_m256i(r, e);
4643    }
4644
4645    #[simd_test(enable = "avx2")]
4646    unsafe fn test_mm256_hsub_epi16() {
4647        let a = _mm256_set1_epi16(2);
4648        let b = _mm256_set1_epi16(4);
4649        let r = _mm256_hsub_epi16(a, b);
4650        let e = _mm256_set1_epi16(0);
4651        assert_eq_m256i(r, e);
4652    }
4653
4654    #[simd_test(enable = "avx2")]
4655    unsafe fn test_mm256_hsub_epi32() {
4656        let a = _mm256_set1_epi32(2);
4657        let b = _mm256_set1_epi32(4);
4658        let r = _mm256_hsub_epi32(a, b);
4659        let e = _mm256_set1_epi32(0);
4660        assert_eq_m256i(r, e);
4661    }
4662
4663    #[simd_test(enable = "avx2")]
4664    unsafe fn test_mm256_hsubs_epi16() {
4665        let a = _mm256_set1_epi16(2);
4666        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4667        let a = _mm256_insert_epi16::<1>(a, -1);
4668        let b = _mm256_set1_epi16(4);
4669        let r = _mm256_hsubs_epi16(a, b);
4670        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
4671        assert_eq_m256i(r, e);
4672    }
4673
4674    #[simd_test(enable = "avx2")]
4675    unsafe fn test_mm256_madd_epi16() {
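        // vpmaddwd multiplies 16-bit pairs and sums adjacent products: 2 * 4 + 2 * 4 == 16 per i32 lane.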
4676        let a = _mm256_set1_epi16(2);
4677        let b = _mm256_set1_epi16(4);
4678        let r = _mm256_madd_epi16(a, b);
4679        let e = _mm256_set1_epi32(16);
4680        assert_eq_m256i(r, e);
4681    }
4682
4683    #[simd_test(enable = "avx2")]
4684    unsafe fn test_mm256_inserti128_si256() {
4685        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4686        let b = _mm_setr_epi64x(7, 8);
4687        let r = _mm256_inserti128_si256::<1>(a, b);
4688        let e = _mm256_setr_epi64x(1, 2, 7, 8);
4689        assert_eq_m256i(r, e);
4690    }
4691
4692    #[simd_test(enable = "avx2")]
4693    unsafe fn test_mm256_maddubs_epi16() {
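        // Unsigned bytes of `a` times signed bytes of `b`, adjacent products summed: 2 * 4 + 2 * 4 == 16.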
4694        let a = _mm256_set1_epi8(2);
4695        let b = _mm256_set1_epi8(4);
4696        let r = _mm256_maddubs_epi16(a, b);
4697        let e = _mm256_set1_epi16(16);
4698        assert_eq_m256i(r, e);
4699    }
4700
4701    #[simd_test(enable = "avx2")]
4702    unsafe fn test_mm_maskload_epi32() {
4703        let nums = [1, 2, 3, 4];
4704        let a = &nums as *const i32;
4705        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4706        let r = _mm_maskload_epi32(a, mask);
4707        let e = _mm_setr_epi32(1, 0, 0, 4);
4708        assert_eq_m128i(r, e);
4709    }
4710
4711    #[simd_test(enable = "avx2")]
4712    unsafe fn test_mm256_maskload_epi32() {
4713        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4714        let a = &nums as *const i32;
4715        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4716        let r = _mm256_maskload_epi32(a, mask);
4717        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4718        assert_eq_m256i(r, e);
4719    }
4720
4721    #[simd_test(enable = "avx2")]
4722    unsafe fn test_mm_maskload_epi64() {
4723        let nums = [1_i64, 2_i64];
4724        let a = &nums as *const i64;
4725        let mask = _mm_setr_epi64x(0, -1);
4726        let r = _mm_maskload_epi64(a, mask);
4727        let e = _mm_setr_epi64x(0, 2);
4728        assert_eq_m128i(r, e);
4729    }
4730
4731    #[simd_test(enable = "avx2")]
4732    unsafe fn test_mm256_maskload_epi64() {
4733        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4734        let a = &nums as *const i64;
4735        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4736        let r = _mm256_maskload_epi64(a, mask);
4737        let e = _mm256_setr_epi64x(0, 2, 3, 0);
4738        assert_eq_m256i(r, e);
4739    }
4740
4741    #[simd_test(enable = "avx2")]
4742    unsafe fn test_mm_maskstore_epi32() {
4743        let a = _mm_setr_epi32(1, 2, 3, 4);
4744        let mut arr = [-1, -1, -1, -1];
4745        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4746        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4747        let e = [1, -1, -1, 4];
4748        assert_eq!(arr, e);
4749    }
4750
4751    #[simd_test(enable = "avx2")]
4752    unsafe fn test_mm256_maskstore_epi32() {
4753        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4754        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4755        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4756        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4757        let e = [1, -1, -1, 42, -1, 6, 7, -1];
4758        assert_eq!(arr, e);
4759    }
4760
4761    #[simd_test(enable = "avx2")]
4762    unsafe fn test_mm_maskstore_epi64() {
4763        let a = _mm_setr_epi64x(1_i64, 2_i64);
4764        let mut arr = [-1_i64, -1_i64];
4765        let mask = _mm_setr_epi64x(0, -1);
4766        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4767        let e = [-1, 2];
4768        assert_eq!(arr, e);
4769    }
4770
4771    #[simd_test(enable = "avx2")]
4772    unsafe fn test_mm256_maskstore_epi64() {
4773        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4774        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4775        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4776        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4777        let e = [-1, 2, 3, -1];
4778        assert_eq!(arr, e);
4779    }
4780
4781    #[simd_test(enable = "avx2")]
4782    unsafe fn test_mm256_max_epi16() {
4783        let a = _mm256_set1_epi16(2);
4784        let b = _mm256_set1_epi16(4);
4785        let r = _mm256_max_epi16(a, b);
4786        assert_eq_m256i(r, b);
4787    }
4788
4789    #[simd_test(enable = "avx2")]
4790    unsafe fn test_mm256_max_epi32() {
4791        let a = _mm256_set1_epi32(2);
4792        let b = _mm256_set1_epi32(4);
4793        let r = _mm256_max_epi32(a, b);
4794        assert_eq_m256i(r, b);
4795    }
4796
4797    #[simd_test(enable = "avx2")]
4798    unsafe fn test_mm256_max_epi8() {
4799        let a = _mm256_set1_epi8(2);
4800        let b = _mm256_set1_epi8(4);
4801        let r = _mm256_max_epi8(a, b);
4802        assert_eq_m256i(r, b);
4803    }
4804
4805    #[simd_test(enable = "avx2")]
4806    unsafe fn test_mm256_max_epu16() {
4807        let a = _mm256_set1_epi16(2);
4808        let b = _mm256_set1_epi16(4);
4809        let r = _mm256_max_epu16(a, b);
4810        assert_eq_m256i(r, b);
4811    }
4812
4813    #[simd_test(enable = "avx2")]
4814    unsafe fn test_mm256_max_epu32() {
4815        let a = _mm256_set1_epi32(2);
4816        let b = _mm256_set1_epi32(4);
4817        let r = _mm256_max_epu32(a, b);
4818        assert_eq_m256i(r, b);
4819    }
4820
4821    #[simd_test(enable = "avx2")]
4822    unsafe fn test_mm256_max_epu8() {
4823        let a = _mm256_set1_epi8(2);
4824        let b = _mm256_set1_epi8(4);
4825        let r = _mm256_max_epu8(a, b);
4826        assert_eq_m256i(r, b);
4827    }
4828
4829    #[simd_test(enable = "avx2")]
4830    unsafe fn test_mm256_min_epi16() {
4831        let a = _mm256_set1_epi16(2);
4832        let b = _mm256_set1_epi16(4);
4833        let r = _mm256_min_epi16(a, b);
4834        assert_eq_m256i(r, a);
4835    }
4836
4837    #[simd_test(enable = "avx2")]
4838    unsafe fn test_mm256_min_epi32() {
4839        let a = _mm256_set1_epi32(2);
4840        let b = _mm256_set1_epi32(4);
4841        let r = _mm256_min_epi32(a, b);
4842        assert_eq_m256i(r, a);
4843    }
4844
4845    #[simd_test(enable = "avx2")]
4846    unsafe fn test_mm256_min_epi8() {
4847        let a = _mm256_set1_epi8(2);
4848        let b = _mm256_set1_epi8(4);
4849        let r = _mm256_min_epi8(a, b);
4850        assert_eq_m256i(r, a);
4851    }
4852
4853    #[simd_test(enable = "avx2")]
4854    unsafe fn test_mm256_min_epu16() {
4855        let a = _mm256_set1_epi16(2);
4856        let b = _mm256_set1_epi16(4);
4857        let r = _mm256_min_epu16(a, b);
4858        assert_eq_m256i(r, a);
4859    }
4860
4861    #[simd_test(enable = "avx2")]
4862    unsafe fn test_mm256_min_epu32() {
4863        let a = _mm256_set1_epi32(2);
4864        let b = _mm256_set1_epi32(4);
4865        let r = _mm256_min_epu32(a, b);
4866        assert_eq_m256i(r, a);
4867    }
4868
4869    #[simd_test(enable = "avx2")]
4870    unsafe fn test_mm256_min_epu8() {
4871        let a = _mm256_set1_epi8(2);
4872        let b = _mm256_set1_epi8(4);
4873        let r = _mm256_min_epu8(a, b);
4874        assert_eq_m256i(r, a);
4875    }
4876
4877    #[simd_test(enable = "avx2")]
4878    unsafe fn test_mm256_movemask_epi8() {
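        // Every byte has its sign bit set, so all 32 mask bits are 1 and the i32 result is -1.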
4879        let a = _mm256_set1_epi8(-1);
4880        let r = _mm256_movemask_epi8(a);
4881        let e = -1;
4882        assert_eq!(r, e);
4883    }
4884
4885    #[simd_test(enable = "avx2")]
4886    unsafe fn test_mm256_mpsadbw_epu8() {
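        // Each 16-bit result is a sum of absolute differences over four byte pairs: 4 * |2 - 4| == 8.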
4887        let a = _mm256_set1_epi8(2);
4888        let b = _mm256_set1_epi8(4);
4889        let r = _mm256_mpsadbw_epu8::<0>(a, b);
4890        let e = _mm256_set1_epi16(8);
4891        assert_eq_m256i(r, e);
4892    }
4893
4894    #[simd_test(enable = "avx2")]
4895    unsafe fn test_mm256_mul_epi32() {
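        // Multiplies only the even-indexed 32-bit elements, widening each product to 64 bits.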
4896        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4897        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4898        let r = _mm256_mul_epi32(a, b);
4899        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4900        assert_eq_m256i(r, e);
4901    }
4902
4903    #[simd_test(enable = "avx2")]
4904    unsafe fn test_mm256_mul_epu32() {
4905        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4906        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4907        let r = _mm256_mul_epu32(a, b);
4908        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4909        assert_eq_m256i(r, e);
4910    }
4911
4912    #[simd_test(enable = "avx2")]
4913    unsafe fn test_mm256_mulhi_epi16() {
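        // 6535 * 6535 == 42_706_225; the upper 16 bits of that product are 651.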
4914        let a = _mm256_set1_epi16(6535);
4915        let b = _mm256_set1_epi16(6535);
4916        let r = _mm256_mulhi_epi16(a, b);
4917        let e = _mm256_set1_epi16(651);
4918        assert_eq_m256i(r, e);
4919    }
4920
4921    #[simd_test(enable = "avx2")]
4922    unsafe fn test_mm256_mulhi_epu16() {
4923        let a = _mm256_set1_epi16(6535);
4924        let b = _mm256_set1_epi16(6535);
4925        let r = _mm256_mulhi_epu16(a, b);
4926        let e = _mm256_set1_epi16(651);
4927        assert_eq_m256i(r, e);
4928    }
4929
4930    #[simd_test(enable = "avx2")]
4931    unsafe fn test_mm256_mullo_epi16() {
4932        let a = _mm256_set1_epi16(2);
4933        let b = _mm256_set1_epi16(4);
4934        let r = _mm256_mullo_epi16(a, b);
4935        let e = _mm256_set1_epi16(8);
4936        assert_eq_m256i(r, e);
4937    }
4938
4939    #[simd_test(enable = "avx2")]
4940    unsafe fn test_mm256_mullo_epi32() {
4941        let a = _mm256_set1_epi32(2);
4942        let b = _mm256_set1_epi32(4);
4943        let r = _mm256_mullo_epi32(a, b);
4944        let e = _mm256_set1_epi32(8);
4945        assert_eq_m256i(r, e);
4946    }
4947
4948    #[simd_test(enable = "avx2")]
4949    unsafe fn test_mm256_mulhrs_epi16() {
        // The rounded high product is (((a * b) >> 14) + 1) >> 1 for each 16-bit lane:
        // 0x4000 * 0x4000 = 0x1000_0000, which rounds to 0x2000.
        let a = _mm256_set1_epi16(0x4000);
        let b = _mm256_set1_epi16(0x4000);
        let r = _mm256_mulhrs_epi16(a, b);
        let e = _mm256_set1_epi16(0x2000);
4954        assert_eq_m256i(r, e);
4955    }
4956
4957    #[simd_test(enable = "avx2")]
4958    unsafe fn test_mm256_or_si256() {
4959        let a = _mm256_set1_epi8(-1);
4960        let b = _mm256_set1_epi8(0);
4961        let r = _mm256_or_si256(a, b);
4962        assert_eq_m256i(r, a);
4963    }
4964
4965    #[simd_test(enable = "avx2")]
4966    unsafe fn test_mm256_packs_epi16() {
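        // Narrows to i8 with signed saturation; each 128-bit lane holds `a`'s lane first, then `b`'s.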
4967        let a = _mm256_set1_epi16(2);
4968        let b = _mm256_set1_epi16(4);
4969        let r = _mm256_packs_epi16(a, b);
4970        #[rustfmt::skip]
4971        let e = _mm256_setr_epi8(
4972            2, 2, 2, 2, 2, 2, 2, 2,
4973            4, 4, 4, 4, 4, 4, 4, 4,
4974            2, 2, 2, 2, 2, 2, 2, 2,
4975            4, 4, 4, 4, 4, 4, 4, 4,
4976        );
4977
4978        assert_eq_m256i(r, e);
4979    }
4980
4981    #[simd_test(enable = "avx2")]
4982    unsafe fn test_mm256_packs_epi32() {
4983        let a = _mm256_set1_epi32(2);
4984        let b = _mm256_set1_epi32(4);
4985        let r = _mm256_packs_epi32(a, b);
4986        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4987
4988        assert_eq_m256i(r, e);
4989    }
4990
4991    #[simd_test(enable = "avx2")]
4992    unsafe fn test_mm256_packus_epi16() {
4993        let a = _mm256_set1_epi16(2);
4994        let b = _mm256_set1_epi16(4);
4995        let r = _mm256_packus_epi16(a, b);
4996        #[rustfmt::skip]
4997        let e = _mm256_setr_epi8(
4998            2, 2, 2, 2, 2, 2, 2, 2,
4999            4, 4, 4, 4, 4, 4, 4, 4,
5000            2, 2, 2, 2, 2, 2, 2, 2,
5001            4, 4, 4, 4, 4, 4, 4, 4,
5002        );
5003
5004        assert_eq_m256i(r, e);
5005    }
5006
5007    #[simd_test(enable = "avx2")]
5008    unsafe fn test_mm256_packus_epi32() {
5009        let a = _mm256_set1_epi32(2);
5010        let b = _mm256_set1_epi32(4);
5011        let r = _mm256_packus_epi32(a, b);
5012        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5013
5014        assert_eq_m256i(r, e);
5015    }
5016
5017    #[simd_test(enable = "avx2")]
5018    unsafe fn test_mm256_sad_epu8() {
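        // Each 64-bit result is the sum of |2 - 4| over eight byte pairs: 8 * 2 == 16.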
5019        let a = _mm256_set1_epi8(2);
5020        let b = _mm256_set1_epi8(4);
5021        let r = _mm256_sad_epu8(a, b);
5022        let e = _mm256_set1_epi64x(16);
5023        assert_eq_m256i(r, e);
5024    }
5025
5026    #[simd_test(enable = "avx2")]
5027    unsafe fn test_mm256_shufflehi_epi16() {
5028        #[rustfmt::skip]
5029        let a = _mm256_setr_epi16(
5030            0, 1, 2, 3, 11, 22, 33, 44,
5031            4, 5, 6, 7, 55, 66, 77, 88,
5032        );
5033        #[rustfmt::skip]
5034        let e = _mm256_setr_epi16(
5035            0, 1, 2, 3, 44, 22, 22, 11,
5036            4, 5, 6, 7, 88, 66, 66, 55,
5037        );
5038        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
5039        assert_eq_m256i(r, e);
5040    }
5041
5042    #[simd_test(enable = "avx2")]
5043    unsafe fn test_mm256_shufflelo_epi16() {
5044        #[rustfmt::skip]
5045        let a = _mm256_setr_epi16(
5046            11, 22, 33, 44, 0, 1, 2, 3,
5047            55, 66, 77, 88, 4, 5, 6, 7,
5048        );
5049        #[rustfmt::skip]
5050        let e = _mm256_setr_epi16(
5051            44, 22, 22, 11, 0, 1, 2, 3,
5052            88, 66, 66, 55, 4, 5, 6, 7,
5053        );
5054        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
5055        assert_eq_m256i(r, e);
5056    }
5057
5058    #[simd_test(enable = "avx2")]
5059    unsafe fn test_mm256_sign_epi16() {
5060        let a = _mm256_set1_epi16(2);
5061        let b = _mm256_set1_epi16(-1);
5062        let r = _mm256_sign_epi16(a, b);
5063        let e = _mm256_set1_epi16(-2);
5064        assert_eq_m256i(r, e);
5065    }
5066
5067    #[simd_test(enable = "avx2")]
5068    unsafe fn test_mm256_sign_epi32() {
5069        let a = _mm256_set1_epi32(2);
5070        let b = _mm256_set1_epi32(-1);
5071        let r = _mm256_sign_epi32(a, b);
5072        let e = _mm256_set1_epi32(-2);
5073        assert_eq_m256i(r, e);
5074    }
5075
5076    #[simd_test(enable = "avx2")]
5077    unsafe fn test_mm256_sign_epi8() {
5078        let a = _mm256_set1_epi8(2);
5079        let b = _mm256_set1_epi8(-1);
5080        let r = _mm256_sign_epi8(a, b);
5081        let e = _mm256_set1_epi8(-2);
5082        assert_eq_m256i(r, e);
5083    }
5084
5085    #[simd_test(enable = "avx2")]
5086    unsafe fn test_mm256_sll_epi16() {
5087        let a = _mm256_set1_epi16(0xFF);
5088        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5089        let r = _mm256_sll_epi16(a, b);
5090        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5091    }
5092
5093    #[simd_test(enable = "avx2")]
5094    unsafe fn test_mm256_sll_epi32() {
5095        let a = _mm256_set1_epi32(0xFFFF);
5096        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5097        let r = _mm256_sll_epi32(a, b);
5098        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5099    }
5100
5101    #[simd_test(enable = "avx2")]
5102    unsafe fn test_mm256_sll_epi64() {
5103        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5104        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
5105        let r = _mm256_sll_epi64(a, b);
5106        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5107    }
5108
5109    #[simd_test(enable = "avx2")]
5110    unsafe fn test_mm256_slli_epi16() {
5111        assert_eq_m256i(
5112            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5113            _mm256_set1_epi16(0xFF0),
5114        );
5115    }
5116
5117    #[simd_test(enable = "avx2")]
5118    unsafe fn test_mm256_slli_epi32() {
5119        assert_eq_m256i(
5120            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5121            _mm256_set1_epi32(0xFFFF0),
5122        );
5123    }
5124
5125    #[simd_test(enable = "avx2")]
5126    unsafe fn test_mm256_slli_epi64() {
5127        assert_eq_m256i(
5128            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5129            _mm256_set1_epi64x(0xFFFFFFFF0),
5130        );
5131    }
5132
5133    #[simd_test(enable = "avx2")]
5134    unsafe fn test_mm256_slli_si256() {
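        // vpslldq shifts each 128-bit lane left by 3 bytes, moving 0xFFFFFFFF up by 24 bits.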
5135        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5136        let r = _mm256_slli_si256::<3>(a);
5137        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5138    }
5139
5140    #[simd_test(enable = "avx2")]
5141    unsafe fn test_mm_sllv_epi32() {
5142        let a = _mm_set1_epi32(2);
5143        let b = _mm_set1_epi32(1);
5144        let r = _mm_sllv_epi32(a, b);
5145        let e = _mm_set1_epi32(4);
5146        assert_eq_m128i(r, e);
5147    }
5148
5149    #[simd_test(enable = "avx2")]
5150    unsafe fn test_mm256_sllv_epi32() {
5151        let a = _mm256_set1_epi32(2);
5152        let b = _mm256_set1_epi32(1);
5153        let r = _mm256_sllv_epi32(a, b);
5154        let e = _mm256_set1_epi32(4);
5155        assert_eq_m256i(r, e);
5156    }
5157
5158    #[simd_test(enable = "avx2")]
5159    unsafe fn test_mm_sllv_epi64() {
5160        let a = _mm_set1_epi64x(2);
5161        let b = _mm_set1_epi64x(1);
5162        let r = _mm_sllv_epi64(a, b);
5163        let e = _mm_set1_epi64x(4);
5164        assert_eq_m128i(r, e);
5165    }
5166
5167    #[simd_test(enable = "avx2")]
5168    unsafe fn test_mm256_sllv_epi64() {
5169        let a = _mm256_set1_epi64x(2);
5170        let b = _mm256_set1_epi64x(1);
5171        let r = _mm256_sllv_epi64(a, b);
5172        let e = _mm256_set1_epi64x(4);
5173        assert_eq_m256i(r, e);
5174    }
5175
5176    #[simd_test(enable = "avx2")]
5177    unsafe fn test_mm256_sra_epi16() {
5178        let a = _mm256_set1_epi16(-1);
5179        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5180        let r = _mm256_sra_epi16(a, b);
5181        assert_eq_m256i(r, _mm256_set1_epi16(-1));
5182    }
5183
5184    #[simd_test(enable = "avx2")]
5185    unsafe fn test_mm256_sra_epi32() {
5186        let a = _mm256_set1_epi32(-1);
5187        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
5188        let r = _mm256_sra_epi32(a, b);
5189        assert_eq_m256i(r, _mm256_set1_epi32(-1));
5190    }
5191
5192    #[simd_test(enable = "avx2")]
5193    unsafe fn test_mm256_srai_epi16() {
5194        assert_eq_m256i(
5195            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
5196            _mm256_set1_epi16(-1),
5197        );
5198    }
5199
5200    #[simd_test(enable = "avx2")]
5201    unsafe fn test_mm256_srai_epi32() {
5202        assert_eq_m256i(
5203            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
5204            _mm256_set1_epi32(-1),
5205        );
5206    }
5207
5208    #[simd_test(enable = "avx2")]
5209    unsafe fn test_mm_srav_epi32() {
5210        let a = _mm_set1_epi32(4);
5211        let count = _mm_set1_epi32(1);
5212        let r = _mm_srav_epi32(a, count);
5213        let e = _mm_set1_epi32(2);
5214        assert_eq_m128i(r, e);
5215    }
5216
5217    #[simd_test(enable = "avx2")]
5218    unsafe fn test_mm256_srav_epi32() {
5219        let a = _mm256_set1_epi32(4);
5220        let count = _mm256_set1_epi32(1);
5221        let r = _mm256_srav_epi32(a, count);
5222        let e = _mm256_set1_epi32(2);
5223        assert_eq_m256i(r, e);
5224    }
5225
5226    #[simd_test(enable = "avx2")]
5227    unsafe fn test_mm256_srli_si256() {
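        // vpsrldq shifts each 128-bit lane right by 3 bytes, so the top 3 bytes of each lane become 0.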
5228        #[rustfmt::skip]
5229        let a = _mm256_setr_epi8(
5230            1, 2, 3, 4, 5, 6, 7, 8,
5231            9, 10, 11, 12, 13, 14, 15, 16,
5232            17, 18, 19, 20, 21, 22, 23, 24,
5233            25, 26, 27, 28, 29, 30, 31, 32,
5234        );
5235        let r = _mm256_srli_si256::<3>(a);
5236        #[rustfmt::skip]
5237        let e = _mm256_setr_epi8(
5238            4, 5, 6, 7, 8, 9, 10, 11,
5239            12, 13, 14, 15, 16, 0, 0, 0,
5240            20, 21, 22, 23, 24, 25, 26, 27,
5241            28, 29, 30, 31, 32, 0, 0, 0,
5242        );
5243        assert_eq_m256i(r, e);
5244    }
5245
5246    #[simd_test(enable = "avx2")]
5247    unsafe fn test_mm256_srl_epi16() {
5248        let a = _mm256_set1_epi16(0xFF);
5249        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5250        let r = _mm256_srl_epi16(a, b);
5251        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5252    }
5253
5254    #[simd_test(enable = "avx2")]
5255    unsafe fn test_mm256_srl_epi32() {
5256        let a = _mm256_set1_epi32(0xFFFF);
5257        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5258        let r = _mm256_srl_epi32(a, b);
5259        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5260    }
5261
5262    #[simd_test(enable = "avx2")]
5263    unsafe fn test_mm256_srl_epi64() {
5264        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5265        let b = _mm_setr_epi64x(4, 0);
5266        let r = _mm256_srl_epi64(a, b);
5267        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5268    }
5269
5270    #[simd_test(enable = "avx2")]
5271    unsafe fn test_mm256_srli_epi16() {
5272        assert_eq_m256i(
5273            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5274            _mm256_set1_epi16(0xF),
5275        );
5276    }
5277
5278    #[simd_test(enable = "avx2")]
5279    unsafe fn test_mm256_srli_epi32() {
5280        assert_eq_m256i(
5281            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5282            _mm256_set1_epi32(0xFFF),
5283        );
5284    }
5285
5286    #[simd_test(enable = "avx2")]
5287    unsafe fn test_mm256_srli_epi64() {
5288        assert_eq_m256i(
5289            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5290            _mm256_set1_epi64x(0xFFFFFFF),
5291        );
5292    }
5293
5294    #[simd_test(enable = "avx2")]
5295    unsafe fn test_mm_srlv_epi32() {
5296        let a = _mm_set1_epi32(2);
5297        let count = _mm_set1_epi32(1);
5298        let r = _mm_srlv_epi32(a, count);
5299        let e = _mm_set1_epi32(1);
5300        assert_eq_m128i(r, e);
5301    }
5302
5303    #[simd_test(enable = "avx2")]
5304    unsafe fn test_mm256_srlv_epi32() {
5305        let a = _mm256_set1_epi32(2);
5306        let count = _mm256_set1_epi32(1);
5307        let r = _mm256_srlv_epi32(a, count);
5308        let e = _mm256_set1_epi32(1);
5309        assert_eq_m256i(r, e);
5310    }
5311
5312    #[simd_test(enable = "avx2")]
5313    unsafe fn test_mm_srlv_epi64() {
5314        let a = _mm_set1_epi64x(2);
5315        let count = _mm_set1_epi64x(1);
5316        let r = _mm_srlv_epi64(a, count);
5317        let e = _mm_set1_epi64x(1);
5318        assert_eq_m128i(r, e);
5319    }
5320
5321    #[simd_test(enable = "avx2")]
5322    unsafe fn test_mm256_srlv_epi64() {
5323        let a = _mm256_set1_epi64x(2);
5324        let count = _mm256_set1_epi64x(1);
5325        let r = _mm256_srlv_epi64(a, count);
5326        let e = _mm256_set1_epi64x(1);
5327        assert_eq_m256i(r, e);
5328    }
5329
5330    #[simd_test(enable = "avx2")]
5331    unsafe fn test_mm256_stream_load_si256() {
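        // The non-temporal load requires a 32-byte-aligned source; __m256i's alignment guarantees that here.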
5332        let a = _mm256_set_epi64x(5, 6, 7, 8);
5333        let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
5334        assert_eq_m256i(a, r);
5335    }
5336
5337    #[simd_test(enable = "avx2")]
5338    unsafe fn test_mm256_sub_epi16() {
5339        let a = _mm256_set1_epi16(4);
5340        let b = _mm256_set1_epi16(2);
5341        let r = _mm256_sub_epi16(a, b);
5342        assert_eq_m256i(r, b);
5343    }
5344
5345    #[simd_test(enable = "avx2")]
5346    unsafe fn test_mm256_sub_epi32() {
5347        let a = _mm256_set1_epi32(4);
5348        let b = _mm256_set1_epi32(2);
5349        let r = _mm256_sub_epi32(a, b);
5350        assert_eq_m256i(r, b);
5351    }
5352
5353    #[simd_test(enable = "avx2")]
5354    unsafe fn test_mm256_sub_epi64() {
5355        let a = _mm256_set1_epi64x(4);
5356        let b = _mm256_set1_epi64x(2);
5357        let r = _mm256_sub_epi64(a, b);
5358        assert_eq_m256i(r, b);
5359    }
5360
5361    #[simd_test(enable = "avx2")]
5362    unsafe fn test_mm256_sub_epi8() {
5363        let a = _mm256_set1_epi8(4);
5364        let b = _mm256_set1_epi8(2);
5365        let r = _mm256_sub_epi8(a, b);
5366        assert_eq_m256i(r, b);
5367    }
5368
5369    #[simd_test(enable = "avx2")]
5370    unsafe fn test_mm256_subs_epi16() {
5371        let a = _mm256_set1_epi16(4);
5372        let b = _mm256_set1_epi16(2);
5373        let r = _mm256_subs_epi16(a, b);
5374        assert_eq_m256i(r, b);
5375    }
5376
5377    #[simd_test(enable = "avx2")]
5378    unsafe fn test_mm256_subs_epi8() {
5379        let a = _mm256_set1_epi8(4);
5380        let b = _mm256_set1_epi8(2);
5381        let r = _mm256_subs_epi8(a, b);
5382        assert_eq_m256i(r, b);
5383    }
5384
5385    #[simd_test(enable = "avx2")]
5386    unsafe fn test_mm256_subs_epu16() {
5387        let a = _mm256_set1_epi16(4);
5388        let b = _mm256_set1_epi16(2);
5389        let r = _mm256_subs_epu16(a, b);
5390        assert_eq_m256i(r, b);
5391    }
5392
5393    #[simd_test(enable = "avx2")]
5394    unsafe fn test_mm256_subs_epu8() {
5395        let a = _mm256_set1_epi8(4);
5396        let b = _mm256_set1_epi8(2);
5397        let r = _mm256_subs_epu8(a, b);
5398        assert_eq_m256i(r, b);
5399    }
5400
5401    #[simd_test(enable = "avx2")]
5402    unsafe fn test_mm256_xor_si256() {
5403        let a = _mm256_set1_epi8(5);
5404        let b = _mm256_set1_epi8(3);
5405        let r = _mm256_xor_si256(a, b);
5406        assert_eq_m256i(r, _mm256_set1_epi8(6));
5407    }
5408
5409    #[simd_test(enable = "avx2")]
5410    unsafe fn test_mm256_alignr_epi8() {
5411        #[rustfmt::skip]
5412        let a = _mm256_setr_epi8(
5413            1, 2, 3, 4, 5, 6, 7, 8,
5414            9, 10, 11, 12, 13, 14, 15, 16,
5415            17, 18, 19, 20, 21, 22, 23, 24,
5416            25, 26, 27, 28, 29, 30, 31, 32,
5417        );
5418        #[rustfmt::skip]
5419        let b = _mm256_setr_epi8(
5420            -1, -2, -3, -4, -5, -6, -7, -8,
5421            -9, -10, -11, -12, -13, -14, -15, -16,
5422            -17, -18, -19, -20, -21, -22, -23, -24,
5423            -25, -26, -27, -28, -29, -30, -31, -32,
5424        );
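        // A byte shift of 32 or more clears the result to all zeros.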
5425        let r = _mm256_alignr_epi8::<33>(a, b);
5426        assert_eq_m256i(r, _mm256_set1_epi8(0));
5427
5428        let r = _mm256_alignr_epi8::<17>(a, b);
5429        #[rustfmt::skip]
5430        let expected = _mm256_setr_epi8(
5431            2, 3, 4, 5, 6, 7, 8, 9,
5432            10, 11, 12, 13, 14, 15, 16, 0,
5433            18, 19, 20, 21, 22, 23, 24, 25,
5434            26, 27, 28, 29, 30, 31, 32, 0,
5435        );
5436        assert_eq_m256i(r, expected);
5437
5438        let r = _mm256_alignr_epi8::<4>(a, b);
5439        #[rustfmt::skip]
5440        let expected = _mm256_setr_epi8(
5441            -5, -6, -7, -8, -9, -10, -11, -12,
5442            -13, -14, -15, -16, 1, 2, 3, 4,
5443            -21, -22, -23, -24, -25, -26, -27, -28,
5444            -29, -30, -31, -32, 17, 18, 19, 20,
5445        );
5446        assert_eq_m256i(r, expected);
5447
5448        let r = _mm256_alignr_epi8::<15>(a, b);
5449        #[rustfmt::skip]
5450        let expected = _mm256_setr_epi8(
5451            -16, 1, 2, 3, 4, 5, 6, 7,
5452            8, 9, 10, 11, 12, 13, 14, 15,
5453            -32, 17, 18, 19, 20, 21, 22, 23,
5454            24, 25, 26, 27, 28, 29, 30, 31,
5455        );
5456        assert_eq_m256i(r, expected);
5457
5458        let r = _mm256_alignr_epi8::<0>(a, b);
5459        assert_eq_m256i(r, b);
5460
5461        let r = _mm256_alignr_epi8::<16>(a, b);
5462        assert_eq_m256i(r, a);
5463    }
5464
5465    #[simd_test(enable = "avx2")]
5466    unsafe fn test_mm256_shuffle_epi8() {
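        // Each index selects a byte within its own 128-bit lane; an index with the high bit set produces 0.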
5467        #[rustfmt::skip]
5468        let a = _mm256_setr_epi8(
5469            1, 2, 3, 4, 5, 6, 7, 8,
5470            9, 10, 11, 12, 13, 14, 15, 16,
5471            17, 18, 19, 20, 21, 22, 23, 24,
5472            25, 26, 27, 28, 29, 30, 31, 32,
5473        );
5474        #[rustfmt::skip]
5475        let b = _mm256_setr_epi8(
5476            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5477            12, 5, 5, 10, 4, 1, 8, 0,
5478            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5479            12, 5, 5, 10, 4, 1, 8, 0,
5480        );
5481        #[rustfmt::skip]
5482        let expected = _mm256_setr_epi8(
5483            5, 0, 5, 4, 9, 13, 7, 4,
5484            13, 6, 6, 11, 5, 2, 9, 1,
5485            21, 0, 21, 20, 25, 29, 23, 20,
5486            29, 22, 22, 27, 21, 18, 25, 17,
5487        );
5488        let r = _mm256_shuffle_epi8(a, b);
5489        assert_eq_m256i(r, expected);
5490    }
5491
5492    #[simd_test(enable = "avx2")]
5493    unsafe fn test_mm256_permutevar8x32_epi32() {
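        // Unlike the byte shuffle, these 32-bit indices can cross the 128-bit lane boundary.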
5494        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5495        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5496        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5497        let r = _mm256_permutevar8x32_epi32(a, b);
5498        assert_eq_m256i(r, expected);
5499    }
5500
5501    #[simd_test(enable = "avx2")]
5502    unsafe fn test_mm256_permute4x64_epi64() {
5503        let a = _mm256_setr_epi64x(100, 200, 300, 400);
5504        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5505        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5506        assert_eq_m256i(r, expected);
5507    }
5508
5509    #[simd_test(enable = "avx2")]
5510    unsafe fn test_mm256_permute2x128_si256() {
5511        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5512        let b = _mm256_setr_epi64x(300, 400, 700, 800);
5513        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5514        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5515        assert_eq_m256i(r, e);
5516    }
5517
5518    #[simd_test(enable = "avx2")]
5519    unsafe fn test_mm256_permute4x64_pd() {
5520        let a = _mm256_setr_pd(1., 2., 3., 4.);
5521        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5522        let e = _mm256_setr_pd(4., 1., 2., 1.);
5523        assert_eq_m256d(r, e);
5524    }
5525
5526    #[simd_test(enable = "avx2")]
5527    unsafe fn test_mm256_permutevar8x32_ps() {
5528        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5529        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5530        let r = _mm256_permutevar8x32_ps(a, b);
5531        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5532        assert_eq_m256(r, e);
5533    }
5534
5535    #[simd_test(enable = "avx2")]
5536    unsafe fn test_mm_i32gather_epi32() {
5537        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes each index address a whole i32 (4 bytes)
5539        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5540        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5541    }
5542
5543    #[simd_test(enable = "avx2")]
5544    unsafe fn test_mm_mask_i32gather_epi32() {
5545        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes each index address a whole i32 (4 bytes)
5547        let r = _mm_mask_i32gather_epi32::<4>(
5548            _mm_set1_epi32(256),
5549            arr.as_ptr(),
5550            _mm_setr_epi32(0, 16, 64, 96),
5551            _mm_setr_epi32(-1, -1, -1, 0),
5552        );
5553        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5554    }
5555
5556    #[simd_test(enable = "avx2")]
5557    unsafe fn test_mm256_i32gather_epi32() {
5558        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes each index address a whole i32 (4 bytes)
5560        let r =
5561            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5562        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5563    }
5564
5565    #[simd_test(enable = "avx2")]
5566    unsafe fn test_mm256_mask_i32gather_epi32() {
5567        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A scale of 4 makes each index address a whole i32 (4 bytes)
5569        let r = _mm256_mask_i32gather_epi32::<4>(
5570            _mm256_set1_epi32(256),
5571            arr.as_ptr(),
5572            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5573            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5574        );
5575        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5576    }
5577
5578    #[simd_test(enable = "avx2")]
5579    unsafe fn test_mm_i32gather_ps() {
5580        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes each index address a whole f32 (4 bytes)
5582        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5583        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5584    }
5585
5586    #[simd_test(enable = "avx2")]
5587    unsafe fn test_mm_mask_i32gather_ps() {
5588        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes each index address a whole f32 (4 bytes)
5590        let r = _mm_mask_i32gather_ps::<4>(
5591            _mm_set1_ps(256.0),
5592            arr.as_ptr(),
5593            _mm_setr_epi32(0, 16, 64, 96),
5594            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5595        );
5596        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5597    }
5598
5599    #[simd_test(enable = "avx2")]
5600    unsafe fn test_mm256_i32gather_ps() {
5601        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes each index address a whole f32 (4 bytes)
5603        let r =
5604            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5605        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5606    }
5607
5608    #[simd_test(enable = "avx2")]
5609    unsafe fn test_mm256_mask_i32gather_ps() {
5610        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A scale of 4 makes each index address a whole f32 (4 bytes)
5612        let r = _mm256_mask_i32gather_ps::<4>(
5613            _mm256_set1_ps(256.0),
5614            arr.as_ptr(),
5615            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5616            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5617        );
5618        assert_eq_m256(
5619            r,
5620            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5621        );
5622    }
5623
5624    #[simd_test(enable = "avx2")]
5625    unsafe fn test_mm_i32gather_epi64() {
5626        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes each index address a whole i64 (8 bytes)
5628        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5629        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5630    }
5631
5632    #[simd_test(enable = "avx2")]
5633    unsafe fn test_mm_mask_i32gather_epi64() {
5634        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes each index address a whole i64 (8 bytes)
5636        let r = _mm_mask_i32gather_epi64::<8>(
5637            _mm_set1_epi64x(256),
5638            arr.as_ptr(),
5639            _mm_setr_epi32(16, 16, 16, 16),
5640            _mm_setr_epi64x(-1, 0),
5641        );
5642        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5643    }
5644
5645    #[simd_test(enable = "avx2")]
5646    unsafe fn test_mm256_i32gather_epi64() {
5647        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes each index address a whole i64 (8 bytes)
5649        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5650        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5651    }
5652
5653    #[simd_test(enable = "avx2")]
5654    unsafe fn test_mm256_mask_i32gather_epi64() {
5655        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A scale of 8 makes each index address a whole i64 (8 bytes)
5657        let r = _mm256_mask_i32gather_epi64::<8>(
5658            _mm256_set1_epi64x(256),
5659            arr.as_ptr(),
5660            _mm_setr_epi32(0, 16, 64, 96),
5661            _mm256_setr_epi64x(-1, -1, -1, 0),
5662        );
5663        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5664    }
5665
5666    #[simd_test(enable = "avx2")]
5667    unsafe fn test_mm_i32gather_pd() {
5668        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5669        // A multiplier of 8 is word-addressing for f64s
5670        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5671        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5672    }
5673
5674    #[simd_test(enable = "avx2")]
5675    unsafe fn test_mm_mask_i32gather_pd() {
5676        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5677        // A multiplier of 8 is word-addressing for f64s
5678        let r = _mm_mask_i32gather_pd::<8>(
5679            _mm_set1_pd(256.0),
5680            arr.as_ptr(),
5681            _mm_setr_epi32(16, 16, 16, 16),
5682            _mm_setr_pd(-1.0, 0.0),
5683        );
5684        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5685    }
5686
5687    #[simd_test(enable = "avx2")]
5688    unsafe fn test_mm256_i32gather_pd() {
5689        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5690        // A multiplier of 8 is word-addressing for f64s
5691        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5692        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5693    }
5694
5695    #[simd_test(enable = "avx2")]
5696    unsafe fn test_mm256_mask_i32gather_pd() {
5697        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5698        // A multiplier of 8 is word-addressing for f64s
5699        let r = _mm256_mask_i32gather_pd::<8>(
5700            _mm256_set1_pd(256.0),
5701            arr.as_ptr(),
5702            _mm_setr_epi32(0, 16, 64, 96),
5703            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5704        );
5705        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5706    }
5707
5708    #[simd_test(enable = "avx2")]
5709    unsafe fn test_mm_i64gather_epi32() {
5710        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5711        // A multiplier of 4 is word-addressing
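        // Only two 64-bit indices are supplied, so only the low two lanes are
        // gathered; the upper 64 bits of the result are zeroed.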
        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
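        // The mask selects between the gathered value (sign bit set) and `src`
        // for the low two lanes; the upper two lanes of the result are zeroed.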
        let r = _mm_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_epi32(-1, 0, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm256_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_epi32(-1, -1, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm256_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_mask_i64gather_epi64::<8>(
            _mm_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_epi64x(-1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_mask_i64gather_epi64::<8>(
            _mm256_set1_epi64x(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_epi64x(-1, -1, -1, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_mask_i64gather_pd::<8>(
            _mm_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_pd(-1.0, 0.0),
        );
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_mask_i64gather_pd::<8>(
            _mm256_set1_pd(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_extract_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31
        );
        let r1 = _mm256_extract_epi8::<0>(a);
        let r2 = _mm256_extract_epi8::<3>(a);
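        // The selected byte is zero-extended into the returned i32, so the -1
        // in lane 0 reads back as 0xFF.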
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_extract_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r1 = _mm256_extract_epi16::<0>(a);
        let r2 = _mm256_extract_epi16::<3>(a);
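        // Likewise, the selected 16-bit word is zero-extended, so -1 reads back
        // as 0xFFFF.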
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }
}