// core/stdarch/crates/core_arch/src/x86/avx2.rs
//! Advanced Vector Extensions 2 (AVX2)
//!
//! AVX2 expands most AVX commands to 256-bit wide vector registers and
//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
//! overview of the instructions available.
//!
//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

// Internal SIMD vector types (i32x8, u16x16, …) and shared x86 plumbing.
use crate::core_arch::{simd::*, x86::*};
// Compiler SIMD intrinsics (simd_add, simd_shuffle, simd_select, …).
use crate::intrinsics::simd::*;

// `assert_instr` verifies in tests that each intrinsic lowers to the expected
// machine instruction.
#[cfg(test)]
use stdarch_test::assert_instr;

27/// Computes the absolute values of packed 32-bit integers in `a`.
28///
29/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
30#[inline]
31#[target_feature(enable = "avx2")]
32#[cfg_attr(test, assert_instr(vpabsd))]
33#[stable(feature = "simd_x86", since = "1.27.0")]
34#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
35pub const fn _mm256_abs_epi32(a: __m256i) -> __m256i {
36    unsafe {
37        let a = a.as_i32x8();
38        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
39        transmute(r)
40    }
41}
42
43/// Computes the absolute values of packed 16-bit integers in `a`.
44///
45/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
46#[inline]
47#[target_feature(enable = "avx2")]
48#[cfg_attr(test, assert_instr(vpabsw))]
49#[stable(feature = "simd_x86", since = "1.27.0")]
50#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
51pub const fn _mm256_abs_epi16(a: __m256i) -> __m256i {
52    unsafe {
53        let a = a.as_i16x16();
54        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
55        transmute(r)
56    }
57}
58
59/// Computes the absolute values of packed 8-bit integers in `a`.
60///
61/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
62#[inline]
63#[target_feature(enable = "avx2")]
64#[cfg_attr(test, assert_instr(vpabsb))]
65#[stable(feature = "simd_x86", since = "1.27.0")]
66#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
67pub const fn _mm256_abs_epi8(a: __m256i) -> __m256i {
68    unsafe {
69        let a = a.as_i8x32();
70        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
71        transmute(r)
72    }
73}
74
75/// Adds packed 64-bit integers in `a` and `b`.
76///
77/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
78#[inline]
79#[target_feature(enable = "avx2")]
80#[cfg_attr(test, assert_instr(vpaddq))]
81#[stable(feature = "simd_x86", since = "1.27.0")]
82#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
83pub const fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
84    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
85}
86
87/// Adds packed 32-bit integers in `a` and `b`.
88///
89/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
90#[inline]
91#[target_feature(enable = "avx2")]
92#[cfg_attr(test, assert_instr(vpaddd))]
93#[stable(feature = "simd_x86", since = "1.27.0")]
94#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
95pub const fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
96    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
97}
98
99/// Adds packed 16-bit integers in `a` and `b`.
100///
101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
102#[inline]
103#[target_feature(enable = "avx2")]
104#[cfg_attr(test, assert_instr(vpaddw))]
105#[stable(feature = "simd_x86", since = "1.27.0")]
106#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
107pub const fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
108    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
109}
110
111/// Adds packed 8-bit integers in `a` and `b`.
112///
113/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
114#[inline]
115#[target_feature(enable = "avx2")]
116#[cfg_attr(test, assert_instr(vpaddb))]
117#[stable(feature = "simd_x86", since = "1.27.0")]
118#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
119pub const fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
120    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
121}
122
123/// Adds packed 8-bit integers in `a` and `b` using saturation.
124///
125/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
126#[inline]
127#[target_feature(enable = "avx2")]
128#[cfg_attr(test, assert_instr(vpaddsb))]
129#[stable(feature = "simd_x86", since = "1.27.0")]
130#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
131pub const fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
132    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
133}
134
135/// Adds packed 16-bit integers in `a` and `b` using saturation.
136///
137/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
138#[inline]
139#[target_feature(enable = "avx2")]
140#[cfg_attr(test, assert_instr(vpaddsw))]
141#[stable(feature = "simd_x86", since = "1.27.0")]
142#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
143pub const fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
144    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
145}
146
147/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
148///
149/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
150#[inline]
151#[target_feature(enable = "avx2")]
152#[cfg_attr(test, assert_instr(vpaddusb))]
153#[stable(feature = "simd_x86", since = "1.27.0")]
154#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
155pub const fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
156    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
157}
158
159/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
160///
161/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
162#[inline]
163#[target_feature(enable = "avx2")]
164#[cfg_attr(test, assert_instr(vpaddusw))]
165#[stable(feature = "simd_x86", since = "1.27.0")]
166#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
167pub const fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
168    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
169}
170
/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 >= 32 {
        return _mm256_setzero_si256();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm256_setzero_si256(), a)
    } else {
        (a, b)
    };
    unsafe {
        // Shifting by exactly one lane returns the (possibly substituted) `a`.
        if IMM8 == 16 {
            return transmute(a);
        }
    }
    // Shuffle index for output byte `i`, computed per 128-bit lane: the shuffle
    // inputs are `b` then `a`, so indices 0..32 select bytes of `b` and indices
    // 32..64 select bytes of `a`. The low `16 - shift` bytes of each lane come
    // from `b`, the rest cross over into `a`.
    const fn mask(shift: u32, i: u32) -> u32 {
        let shift = shift % 16;
        let mod_i = i % 16;
        if mod_i < (16 - shift) {
            i + shift
        } else {
            i + 16 + shift
        }
    }

    unsafe {
        let r: i8x32 = simd_shuffle!(
            b.as_i8x32(),
            a.as_i8x32(),
            [
                mask(IMM8 as u32, 0),
                mask(IMM8 as u32, 1),
                mask(IMM8 as u32, 2),
                mask(IMM8 as u32, 3),
                mask(IMM8 as u32, 4),
                mask(IMM8 as u32, 5),
                mask(IMM8 as u32, 6),
                mask(IMM8 as u32, 7),
                mask(IMM8 as u32, 8),
                mask(IMM8 as u32, 9),
                mask(IMM8 as u32, 10),
                mask(IMM8 as u32, 11),
                mask(IMM8 as u32, 12),
                mask(IMM8 as u32, 13),
                mask(IMM8 as u32, 14),
                mask(IMM8 as u32, 15),
                mask(IMM8 as u32, 16),
                mask(IMM8 as u32, 17),
                mask(IMM8 as u32, 18),
                mask(IMM8 as u32, 19),
                mask(IMM8 as u32, 20),
                mask(IMM8 as u32, 21),
                mask(IMM8 as u32, 22),
                mask(IMM8 as u32, 23),
                mask(IMM8 as u32, 24),
                mask(IMM8 as u32, 25),
                mask(IMM8 as u32, 26),
                mask(IMM8 as u32, 27),
                mask(IMM8 as u32, 28),
                mask(IMM8 as u32, 29),
                mask(IMM8 as u32, 30),
                mask(IMM8 as u32, 31),
            ],
        );
        transmute(r)
    }
}

254/// Computes the bitwise AND of 256 bits (representing integer data)
255/// in `a` and `b`.
256///
257/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
258#[inline]
259#[target_feature(enable = "avx2")]
260#[cfg_attr(test, assert_instr(vandps))]
261#[stable(feature = "simd_x86", since = "1.27.0")]
262#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
263pub const fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
264    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
265}
266
267/// Computes the bitwise NOT of 256 bits (representing integer data)
268/// in `a` and then AND with `b`.
269///
270/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
271#[inline]
272#[target_feature(enable = "avx2")]
273#[cfg_attr(test, assert_instr(vandnps))]
274#[stable(feature = "simd_x86", since = "1.27.0")]
275#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
276pub const fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
277    unsafe {
278        let all_ones = _mm256_set1_epi8(-1);
279        transmute(simd_and(
280            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
281            b.as_i64x4(),
282        ))
283    }
284}
285
286/// Averages packed unsigned 16-bit integers in `a` and `b`.
287///
288/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
289#[inline]
290#[target_feature(enable = "avx2")]
291#[cfg_attr(test, assert_instr(vpavgw))]
292#[stable(feature = "simd_x86", since = "1.27.0")]
293#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
294pub const fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
295    unsafe {
296        let a = simd_cast::<_, u32x16>(a.as_u16x16());
297        let b = simd_cast::<_, u32x16>(b.as_u16x16());
298        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
299        transmute(simd_cast::<_, u16x16>(r))
300    }
301}
302
303/// Averages packed unsigned 8-bit integers in `a` and `b`.
304///
305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
306#[inline]
307#[target_feature(enable = "avx2")]
308#[cfg_attr(test, assert_instr(vpavgb))]
309#[stable(feature = "simd_x86", since = "1.27.0")]
310#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
311pub const fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
312    unsafe {
313        let a = simd_cast::<_, u16x32>(a.as_u8x32());
314        let b = simd_cast::<_, u16x32>(b.as_u8x32());
315        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
316        transmute(simd_cast::<_, u8x32>(r))
317    }
318}
319
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        // Shuffle indices 0..4 select from `a`, 4..8 from `b`. Each lookup
        // table is keyed by two mask bits at a time so it can be written as a
        // const expression; element `k` comes from `b` (index `k + 4`) exactly
        // when bit `k` of `IMM4` is set.
        let r: i32x4 = simd_shuffle!(
            a,
            b,
            [
                [0, 4, 0, 4][IMM4 as usize & 0b11],
                [1, 1, 5, 5][IMM4 as usize & 0b11],
                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        // Shuffle indices 0..8 select from `a`, 8..16 from `b`. Each lookup
        // table is keyed by two mask bits at a time; element `k` comes from
        // `b` (index `k + 8`) exactly when bit `k` of `IMM8` is set.
        let r: i32x8 = simd_shuffle!(
            a,
            b,
            [
                [0, 8, 0, 8][IMM8 as usize & 0b11],
                [1, 1, 9, 9][IMM8 as usize & 0b11],
                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();

        // Shuffle indices 0..16 select from `a`, 16..32 from `b`. The 8-bit
        // mask is applied to each 128-bit lane independently (bit `k` controls
        // elements `k` and `k + 8`), matching `vpblendw` semantics. Each table
        // is keyed by two mask bits at a time.
        let r: i16x16 = simd_shuffle!(
            a,
            b,
            [
                [0, 16, 0, 16][IMM8 as usize & 0b11],
                [1, 1, 17, 17][IMM8 as usize & 0b11],
                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
                [8, 24, 8, 24][IMM8 as usize & 0b11],
                [9, 9, 25, 25][IMM8 as usize & 0b11],
                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

421/// Blends packed 8-bit integers from `a` and `b` using `mask`.
422///
423/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
424#[inline]
425#[target_feature(enable = "avx2")]
426#[cfg_attr(test, assert_instr(vpblendvb))]
427#[stable(feature = "simd_x86", since = "1.27.0")]
428#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
429pub const fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
430    unsafe {
431        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
432        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
433    }
434}
435
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    unsafe {
        // An all-zero index vector replicates element 0 of `a` into every
        // output lane; the zero vector is only a second-operand placeholder.
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
        transmute::<i8x16, _>(ret)
    }
}

/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    unsafe {
        // 32 zero indices replicate element 0 of `a`, widening 128 -> 256 bits.
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
        transmute::<i8x32, _>(ret)
    }
}

// N.B., `simd_shuffle!` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    unsafe {
        // An all-zero index vector replicates element 0 of `a`.
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
        transmute::<i32x4, _>(ret)
    }
}

// N.B., `simd_shuffle!` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    unsafe {
        // 8 zero indices replicate element 0 of `a`, widening 128 -> 256 bits.
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
        transmute::<i32x8, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
// Emits `vmovddup` instead of `vpbroadcastq`
// See https://github.com/rust-lang/stdarch/issues/791
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    unsafe {
        // Both shuffle operands are `a`; zero indices replicate element 0.
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
        transmute::<i64x2, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    unsafe {
        // 4 zero indices replicate element 0 of `a`, widening 128 -> 256 bits.
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    // Zero indices replicate element 0; the zero vector is a placeholder operand.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    // 4 zero indices replicate element 0 of `a`, widening 128 -> 256 bits.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
}

/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Indices [0, 1, 0, 1] repeat both 64-bit halves of `a`, placing a
        // copy of the full 128-bit value in each 128-bit lane of the result.
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Indices [0, 1, 0, 1] repeat both 64-bit halves of `a`, placing a
        // copy of the full 128-bit value in each 128-bit lane of the result.
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    // Zero indices replicate element 0; the zero vector is a placeholder operand.
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    // 8 zero indices replicate element 0 of `a`, widening 128 -> 256 bits.
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
}

/// Broadcasts the low packed 16-bit integer from a to all elements of
/// the 128-bit returned value
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    unsafe {
        // An all-zero index vector replicates element 0 of `a`.
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
        transmute::<i16x8, _>(ret)
    }
}

/// Broadcasts the low packed 16-bit integer from a to all elements of
/// the 256-bit returned value
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    unsafe {
        // 16 zero indices replicate element 0 of `a`, widening 128 -> 256 bits.
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
        transmute::<i16x16, _>(ret)
    }
}

654/// Compares packed 64-bit integers in `a` and `b` for equality.
655///
656/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
657#[inline]
658#[target_feature(enable = "avx2")]
659#[cfg_attr(test, assert_instr(vpcmpeqq))]
660#[stable(feature = "simd_x86", since = "1.27.0")]
661#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
662pub const fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
663    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
664}
665
666/// Compares packed 32-bit integers in `a` and `b` for equality.
667///
668/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
669#[inline]
670#[target_feature(enable = "avx2")]
671#[cfg_attr(test, assert_instr(vpcmpeqd))]
672#[stable(feature = "simd_x86", since = "1.27.0")]
673#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
674pub const fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
675    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
676}
677
678/// Compares packed 16-bit integers in `a` and `b` for equality.
679///
680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
681#[inline]
682#[target_feature(enable = "avx2")]
683#[cfg_attr(test, assert_instr(vpcmpeqw))]
684#[stable(feature = "simd_x86", since = "1.27.0")]
685#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
686pub const fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
687    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
688}
689
690/// Compares packed 8-bit integers in `a` and `b` for equality.
691///
692/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
693#[inline]
694#[target_feature(enable = "avx2")]
695#[cfg_attr(test, assert_instr(vpcmpeqb))]
696#[stable(feature = "simd_x86", since = "1.27.0")]
697#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
698pub const fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
699    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
700}
701
702/// Compares packed 64-bit integers in `a` and `b` for greater-than.
703///
704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
705#[inline]
706#[target_feature(enable = "avx2")]
707#[cfg_attr(test, assert_instr(vpcmpgtq))]
708#[stable(feature = "simd_x86", since = "1.27.0")]
709#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
710pub const fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
711    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
712}
713
714/// Compares packed 32-bit integers in `a` and `b` for greater-than.
715///
716/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
717#[inline]
718#[target_feature(enable = "avx2")]
719#[cfg_attr(test, assert_instr(vpcmpgtd))]
720#[stable(feature = "simd_x86", since = "1.27.0")]
721#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
722pub const fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
723    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
724}
725
726/// Compares packed 16-bit integers in `a` and `b` for greater-than.
727///
728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
729#[inline]
730#[target_feature(enable = "avx2")]
731#[cfg_attr(test, assert_instr(vpcmpgtw))]
732#[stable(feature = "simd_x86", since = "1.27.0")]
733#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
734pub const fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
735    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
736}
737
738/// Compares packed 8-bit integers in `a` and `b` for greater-than.
739///
740/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
741#[inline]
742#[target_feature(enable = "avx2")]
743#[cfg_attr(test, assert_instr(vpcmpgtb))]
744#[stable(feature = "simd_x86", since = "1.27.0")]
745#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
746pub const fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
747    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
748}
749
750/// Sign-extend 16-bit integers to 32-bit integers.
751///
752/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
753#[inline]
754#[target_feature(enable = "avx2")]
755#[cfg_attr(test, assert_instr(vpmovsxwd))]
756#[stable(feature = "simd_x86", since = "1.27.0")]
757#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
758pub const fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
759    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
760}
761
/// Sign-extend the lower four 16-bit integers in `a` to 64-bit integers.
/// The upper four elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i16x8();
        // Extract the low four elements, then sign-extend them with the cast.
        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}
777
778/// Sign-extend 32-bit integers to 64-bit integers.
779///
780/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
781#[inline]
782#[target_feature(enable = "avx2")]
783#[cfg_attr(test, assert_instr(vpmovsxdq))]
784#[stable(feature = "simd_x86", since = "1.27.0")]
785#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
786pub const fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
787    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
788}
789
790/// Sign-extend 8-bit integers to 16-bit integers.
791///
792/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
793#[inline]
794#[target_feature(enable = "avx2")]
795#[cfg_attr(test, assert_instr(vpmovsxbw))]
796#[stable(feature = "simd_x86", since = "1.27.0")]
797#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
798pub const fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
799    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
800}
801
/// Sign-extend the lower eight 8-bit integers in `a` to 32-bit integers.
/// The upper eight elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        // Extract the low eight elements, then sign-extend them with the cast.
        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}
817
/// Sign-extend the lower four 8-bit integers in `a` to 64-bit integers.
/// The upper twelve elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        // Extract the low four elements, then sign-extend them with the cast.
        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}
833
834/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit
835/// integers, and stores the results in `dst`.
836///
837/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
838#[inline]
839#[target_feature(enable = "avx2")]
840#[cfg_attr(test, assert_instr(vpmovzxwd))]
841#[stable(feature = "simd_x86", since = "1.27.0")]
842#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
843pub const fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
844    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
845}
846
/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
/// integers. The upper four elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u16x8();
        // Extract the low four elements; the unsigned cast zero-extends them.
        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}
863
864/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
865///
866/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
867#[inline]
868#[target_feature(enable = "avx2")]
869#[cfg_attr(test, assert_instr(vpmovzxdq))]
870#[stable(feature = "simd_x86", since = "1.27.0")]
871#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
872pub const fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
873    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
874}
875
876/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
877///
878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
879#[inline]
880#[target_feature(enable = "avx2")]
881#[cfg_attr(test, assert_instr(vpmovzxbw))]
882#[stable(feature = "simd_x86", since = "1.27.0")]
883#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
884pub const fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
885    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
886}
887
/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
/// integers. The upper eight elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        // Extract the low eight elements; the unsigned cast zero-extends them.
        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}
904
/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
/// integers. The upper twelve elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        // Extract the low four elements; the unsigned cast zero-extends them.
        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}
921
/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`:
/// the low half when `IMM1 == 0`, the high half when `IMM1 == 1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
// NOTE(review): the codegen test asserts `vextractf128` (the AVX form) rather
// than `vextracti128` — presumably the form LLVM selects for this shuffle.
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        // `simd_shuffle!` needs a second input; `b` is never selected from,
        // since both index sets ([0, 1] and [2, 3]) point into `a`.
        let b = i64x4::ZERO;
        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
        transmute(dst)
    }
}
940
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        // Gather the first (even-index) and second (odd-index) member of each
        // adjacent pair, arranged to match vphaddw's per-128-bit-lane output
        // order (a.low, b.low | a.high, b.high), then add element-wise.
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_add(even, odd).as_m256i()
    }
}
966
/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        // Gather the first (even-index) and second (odd-index) member of each
        // adjacent pair, in vphaddd's per-128-bit-lane output order, then add.
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd).as_m256i()
    }
}
984
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
// NOTE(review): unlike the non-saturating `_mm256_hadd_epi16`, this is not a
// `const fn` — presumably `simd_saturating_add` is not const-callable; confirm.
pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        // Same even/odd pair split as `_mm256_hadd_epi16`, but combined with a
        // saturating add so sums clamp to the i16 range.
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_saturating_add(even, odd).as_m256i()
    }
}
1010
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        // Each result lane is (even member) - (odd member) of an adjacent
        // pair, arranged in vphsubw's per-128-bit-lane output order.
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_sub(even, odd).as_m256i()
    }
}
1036
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        // Each result lane is (even member) - (odd member) of an adjacent
        // pair, arranged in vphsubd's per-128-bit-lane output order.
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd).as_m256i()
    }
}
1054
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
// NOTE(review): unlike the non-saturating `_mm256_hsub_epi16`, this is not a
// `const fn` — presumably `simd_saturating_sub` is not const-callable; confirm.
pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        // Same even/odd pair split as `_mm256_hsub_epi16`, but combined with a
        // saturating subtract so differences clamp to the i16 range.
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_saturating_sub(even, odd).as_m256i()
    }
}
1080
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: every lane is gathered from memory.
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1103
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding `mask`
/// element has its highest bit set are gathered from memory; the remaining
/// lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1129
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: every lane is gathered from memory.
    let zero = i32x8::ZERO;
    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1152
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding `mask`
/// element has its highest bit set are gathered from memory; the remaining
/// lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
    src: __m256i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x8();
    let mask = mask.as_i32x8();
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1178
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_ps();
    // -1.0 has its sign bit set, which is the bit the gather mask checks, so
    // every lane is gathered from memory.
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}
1197
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding `mask`
/// element has its highest (sign) bit set are gathered from memory; the
/// remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdps(src, slice, offsets, mask, SCALE as i8)
}
1220
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_ps();
    // -1.0 has its sign bit set, which is the bit the gather mask checks, so
    // every lane is gathered from memory.
    let neg_one = _mm256_set1_ps(-1.0);
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}
1239
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding `mask`
/// element has its highest (sign) bit set are gathered from memory; the
/// remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
    src: __m256,
    slice: *const f32,
    offsets: __m256i,
    mask: __m256,
) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
}
1262
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Only the lower two 32-bit offsets
/// are used.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: every lane is gathered from memory.
    let zero = i64x2::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1285
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding `mask`
/// element has its highest bit set are gathered from memory; the remaining
/// lanes are copied from `src`. Only the lower two 32-bit offsets are used.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x2();
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1311
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: every lane is gathered from memory.
    let zero = i64x4::ZERO;
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1334
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding `mask`
/// element has its highest bit set are gathered from memory; the remaining
/// lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x4();
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1360
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Only the lower two 32-bit offsets
/// are used.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_pd();
    // -1.0 has its sign bit set, which is the bit the gather mask checks, so
    // every lane is gathered from memory.
    let neg_one = _mm_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1379
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding `mask`
/// element has its highest (sign) bit set are gathered from memory; the
/// remaining lanes are copied from `src`. Only the lower two 32-bit offsets
/// are used.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
}
1402
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
    slice: *const f64,
    offsets: __m128i,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_pd();
    // -1.0 has its sign bit set, which is the bit the gather mask checks, so
    // every lane is gathered from memory.
    let neg_one = _mm256_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1424
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding `mask`
/// element has its highest (sign) bit set are gathered from memory; the
/// remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m256d,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
}
1447
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. The two 64-bit offsets produce the
/// lower two 32-bit lanes of the result; the upper lanes are zeroed.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: every lane is gathered from memory.
    // (set1_epi64x(-1) reinterpreted as i32x4 is still all-ones per lane.)
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    let offsets = offsets.as_i64x2();
    let slice = slice as *const i8;
    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1470
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. Lanes whose corresponding `mask`
/// element has its highest bit set are gathered from memory; the remaining
/// lanes are copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i64x2();
    let slice = slice as *const i8;
    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1496
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Unmasked gather: zeroed destination plus an all-ones mask, so every
    // lane is loaded from memory.
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    // Four 64-bit offsets produce four 32-bit results (a 128-bit register).
    let offsets = offsets.as_i64x4();
    let slice = slice as *const i8;
    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1519
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. Lanes whose highest bit is not set in the
/// corresponding element of `mask` are copied from `src` instead of being
/// loaded from memory.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x4();
    // Sign bit of each mask lane selects memory (set) vs. `src` (clear).
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i64x4();
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1545
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_ps();
    // The gather tests each mask lane's sign bit; -1.0 has it set, so every
    // lane is loaded from memory (unmasked gather).
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i64x2();
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
}
1564
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. Lanes whose highest (sign) bit is not set
/// in the corresponding element of `mask` are copied from `src` instead of
/// being loaded from memory.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i64x2();
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    pgatherqps(src, slice, offsets, mask, SCALE as i8)
}
1587
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_ps();
    // -1.0 has its sign bit set — the bit the gather tests — so every lane is
    // loaded from memory (unmasked gather).
    let neg_one = _mm_set1_ps(-1.0);
    // Four 64-bit offsets produce four f32 results (a 128-bit register).
    let offsets = offsets.as_i64x4();
    let slice = slice as *const i8;
    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
}
1606
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. Lanes whose highest (sign) bit is not set
/// in the corresponding element of `mask` are copied from `src` instead of
/// being loaded from memory.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m256i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i64x4();
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
}
1629
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Unmasked gather: zeroed destination plus an all-ones mask, so every
    // lane is loaded from memory.
    let zero = i64x2::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1652
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. Lanes whose highest bit is not set in the
/// corresponding element of `mask` are copied from `src` instead of being
/// loaded from memory.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x2();
    // Sign bit of each mask lane selects memory (set) vs. `src` (clear).
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i64x2();
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1678
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // Unmasked gather: zeroed destination plus an all-ones mask, so every
    // lane is loaded from memory.
    let zero = i64x4::ZERO;
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1701
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. Lanes whose highest bit is not set in the
/// corresponding element of `mask` are copied from `src` instead of being
/// loaded from memory.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x4();
    // Sign bit of each mask lane selects memory (set) vs. `src` (clear).
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i64x4();
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1727
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_pd();
    // -1.0 has its sign bit set — the bit the gather tests — so every lane is
    // loaded from memory (unmasked gather).
    let neg_one = _mm_set1_pd(-1.0);
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1746
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. Lanes whose highest (sign) bit is not set
/// in the corresponding element of `mask` are copied from `src` instead of
/// being loaded from memory.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
}
1769
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
    slice: *const f64,
    offsets: __m256i,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_pd();
    // -1.0 has its sign bit set — the bit the gather tests — so every lane is
    // loaded from memory (unmasked gather).
    let neg_one = _mm256_set1_pd(-1.0);
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1791
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. Lanes whose highest (sign) bit is not set
/// in the corresponding element of `mask` are copied from `src` instead of
/// being loaded from memory.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m256i,
    mask: __m256d,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    // Byte pointer for the intrinsic; offsets are scaled by `SCALE` in hardware.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
}
1814
/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the
/// location specified by `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        // Widen `b` to 256 bits so it can be shuffled with `a`; only its low
        // two i64 lanes (shuffle indices 4 and 5) are ever selected below.
        let b = _mm256_castsi128_si256(b).as_i64x4();
        // Shuffle indices 0-3 pick from `a`, 4-7 from `b`:
        // IMM1 == 0 replaces the low 128 bits (lanes 0-1) with `b`,
        // IMM1 == 1 replaces the high 128 bits (lanes 2-3) with `b`.
        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
        transmute(dst)
    }
}
1834
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs
/// of intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
    //
    // ```rust
    // #[target_feature(enable = "avx2")]
    // unsafe fn widening_add(mad: __m256i) -> __m256i {
    //     _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
    // }
    // ```
    //
    // If we implement this using generic vector intrinsics, the optimizer
    // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
    // For this reason, we use x86 intrinsics.
    //
    // Calling the LLVM `pmaddwd` intrinsic directly guarantees the single
    // `vpmaddwd` instruction that `assert_instr` above checks for.
    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
}
1859
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate
/// signed 16-bit integers with saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Delegates to the LLVM intrinsic so the `vpmaddubsw` instruction
    // (unsigned x signed multiply-add) is emitted verbatim.
    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_i8x32())) }
}
1873
/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
    // Arithmetic shift by 31 broadcasts each lane's sign bit across the lane:
    // all-ones where the high bit was set, all-zeros otherwise.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    // Masked-off lanes take the fallback value (zero); masked-off memory is
    // never touched.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x4::ZERO).as_m128i()
}
1888
/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
    // Arithmetic shift by 31 broadcasts each lane's sign bit across the lane.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    // Masked-off lanes become zero; their memory is never touched.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x8::ZERO).as_m256i()
}
1903
/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
    // Arithmetic shift by 63 broadcasts each lane's sign bit across the lane.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    // Masked-off lanes become zero; their memory is never touched.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x2::ZERO).as_m128i()
}
1918
/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
    // Arithmetic shift by 63 broadcasts each lane's sign bit across the lane.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    // Masked-off lanes become zero; their memory is never touched.
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x4::ZERO).as_m256i()
}
1933
/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
    // Arithmetic shift by 31 broadcasts each lane's sign bit across the lane;
    // only all-ones lanes are written to memory.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4())
}
1948
/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
    // Arithmetic shift by 31 broadcasts each lane's sign bit across the lane;
    // only all-ones lanes are written to memory.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8())
}
1963
/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
    // Arithmetic shift by 63 broadcasts each lane's sign bit across the lane;
    // only all-ones lanes are written to memory.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2())
}
1978
/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
    // Arithmetic shift by 63 broadcasts each lane's sign bit across the lane;
    // only all-ones lanes are written to memory.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4())
}
1993
1994/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1995/// maximum values.
1996///
1997/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1998#[inline]
1999#[target_feature(enable = "avx2")]
2000#[cfg_attr(test, assert_instr(vpmaxsw))]
2001#[stable(feature = "simd_x86", since = "1.27.0")]
2002#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2003pub const fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
2004    unsafe { simd_imax(a.as_i16x16(), b.as_i16x16()).as_m256i() }
2005}
2006
2007/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2008/// maximum values.
2009///
2010/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
2011#[inline]
2012#[target_feature(enable = "avx2")]
2013#[cfg_attr(test, assert_instr(vpmaxsd))]
2014#[stable(feature = "simd_x86", since = "1.27.0")]
2015#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2016pub const fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
2017    unsafe { simd_imax(a.as_i32x8(), b.as_i32x8()).as_m256i() }
2018}
2019
2020/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2021/// maximum values.
2022///
2023/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
2024#[inline]
2025#[target_feature(enable = "avx2")]
2026#[cfg_attr(test, assert_instr(vpmaxsb))]
2027#[stable(feature = "simd_x86", since = "1.27.0")]
2028#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2029pub const fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2030    unsafe { simd_imax(a.as_i8x32(), b.as_i8x32()).as_m256i() }
2031}
2032
2033/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2034/// the packed maximum values.
2035///
2036/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
2037#[inline]
2038#[target_feature(enable = "avx2")]
2039#[cfg_attr(test, assert_instr(vpmaxuw))]
2040#[stable(feature = "simd_x86", since = "1.27.0")]
2041#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2042pub const fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2043    unsafe { simd_imax(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2044}
2045
2046/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2047/// the packed maximum values.
2048///
2049/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
2050#[inline]
2051#[target_feature(enable = "avx2")]
2052#[cfg_attr(test, assert_instr(vpmaxud))]
2053#[stable(feature = "simd_x86", since = "1.27.0")]
2054#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2055pub const fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2056    unsafe { simd_imax(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2057}
2058
2059/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2060/// the packed maximum values.
2061///
2062/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
2063#[inline]
2064#[target_feature(enable = "avx2")]
2065#[cfg_attr(test, assert_instr(vpmaxub))]
2066#[stable(feature = "simd_x86", since = "1.27.0")]
2067#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2068pub const fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2069    unsafe { simd_imax(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2070}
2071
2072/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2073/// minimum values.
2074///
2075/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
2076#[inline]
2077#[target_feature(enable = "avx2")]
2078#[cfg_attr(test, assert_instr(vpminsw))]
2079#[stable(feature = "simd_x86", since = "1.27.0")]
2080#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2081pub const fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2082    unsafe { simd_imin(a.as_i16x16(), b.as_i16x16()).as_m256i() }
2083}
2084
2085/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2086/// minimum values.
2087///
2088/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
2089#[inline]
2090#[target_feature(enable = "avx2")]
2091#[cfg_attr(test, assert_instr(vpminsd))]
2092#[stable(feature = "simd_x86", since = "1.27.0")]
2093#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2094pub const fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2095    unsafe { simd_imin(a.as_i32x8(), b.as_i32x8()).as_m256i() }
2096}
2097
2098/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2099/// minimum values.
2100///
2101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
2102#[inline]
2103#[target_feature(enable = "avx2")]
2104#[cfg_attr(test, assert_instr(vpminsb))]
2105#[stable(feature = "simd_x86", since = "1.27.0")]
2106#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2107pub const fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2108    unsafe { simd_imin(a.as_i8x32(), b.as_i8x32()).as_m256i() }
2109}
2110
2111/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2112/// the packed minimum values.
2113///
2114/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2115#[inline]
2116#[target_feature(enable = "avx2")]
2117#[cfg_attr(test, assert_instr(vpminuw))]
2118#[stable(feature = "simd_x86", since = "1.27.0")]
2119#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2120pub const fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2121    unsafe { simd_imin(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2122}
2123
2124/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2125/// the packed minimum values.
2126///
2127/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2128#[inline]
2129#[target_feature(enable = "avx2")]
2130#[cfg_attr(test, assert_instr(vpminud))]
2131#[stable(feature = "simd_x86", since = "1.27.0")]
2132#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2133pub const fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2134    unsafe { simd_imin(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2135}
2136
2137/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2138/// the packed minimum values.
2139///
2140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2141#[inline]
2142#[target_feature(enable = "avx2")]
2143#[cfg_attr(test, assert_instr(vpminub))]
2144#[stable(feature = "simd_x86", since = "1.27.0")]
2145#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2146pub const fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2147    unsafe { simd_imin(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2148}
2149
2150/// Creates mask from the most significant bit of each 8-bit element in `a`,
2151/// return the result.
2152///
2153/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
2154#[inline]
2155#[target_feature(enable = "avx2")]
2156#[cfg_attr(test, assert_instr(vpmovmskb))]
2157#[stable(feature = "simd_x86", since = "1.27.0")]
2158#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2159pub const fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2160    unsafe {
2161        let z = i8x32::ZERO;
2162        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2163        simd_bitmask::<_, u32>(m) as i32
2164    }
2165}
2166
/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
/// results in dst. Eight SADs are performed for each 128-bit lane using one
/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at on the offset specified in `imm8`. Eight
/// quadruplets are formed from sequential 8-bit integers selected from `a`
/// starting at the offset specified in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegate to the LLVM intrinsic, which takes the control immediate as i8
    // (the assert above guarantees IMM8 fits in 8 bits).
    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
}
2185
/// Multiplies the low 32-bit integers from each packed 64-bit element in
/// `a` and `b`
///
/// Returns the 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // The i64x4 -> i32x4 cast truncates each 64-bit element to its low
        // 32 bits; the cast back sign-extends, so the subsequent 64-bit
        // multiply matches VPMULDQ's signed low-half product.
        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
        transmute(simd_mul(a, b))
    }
}
2204
/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
/// element in `a` and `b`
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_u64x4();
        let b = b.as_u64x4();
        // Masking each 64-bit element to its low 32 bits makes the plain
        // 64-bit multiply equal to VPMULUDQ's zero-extended product.
        let mask = u64x4::splat(u32::MAX as u64);
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}
2224
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers and returning the high 16 bits of the
/// intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Widen to 32 bits so the full product is representable, then take
        // the high half via an arithmetic shift by 16.
        let a = simd_cast::<_, i32x16>(a.as_i16x16());
        let b = simd_cast::<_, i32x16>(b.as_i16x16());
        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
        // Narrowing back to i16 keeps exactly the high 16 bits computed above.
        transmute(simd_cast::<i32x16, i16x16>(r))
    }
}
2243
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers and returning the high 16 bits of the
/// intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Same scheme as `_mm256_mulhi_epi16`, but with unsigned widening
        // and a logical shift, matching VPMULHUW.
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
        transmute(simd_cast::<u32x16, u16x16>(r))
    }
}
2262
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
/// intermediate 32-bit integers, and returns the low 16 bits of the
/// intermediate integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
    // A wrapping 16-bit multiply already equals the low half of the 32-bit
    // intermediate product, so no widening is needed.
    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
}
2276
/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of the
/// intermediate integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
    // A wrapping 32-bit multiply equals the low half of the 64-bit
    // intermediate product, so no widening is needed.
    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
}
2290
/// Multiplies packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncate each intermediate
/// integer to the 18 most significant bits, round by adding 1, and
/// return bits `[16:1]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhrsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // The rounding behavior has no generic `simd_*` equivalent, so this
    // delegates to the `pmulhrsw` intrinsic declared elsewhere in this module.
    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
}
2304
/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
/// and `b`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
    // The i32x8 view is arbitrary: OR is bitwise, so any lane width works.
    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
}
2317
/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpacksswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Saturate each 16-bit lane into the i8 range before narrowing.
        let max = simd_splat(i8::MAX as i16);
        let min = simd_splat(i8::MIN as i16);

        // Clamp, then reinterpret as bytes; each i16 lane now holds its
        // saturated value in the low byte (little-endian), which the shuffle
        // below selects via the even byte indices.
        let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
            .as_m256i()
            .as_i8x32();
        let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min)
            .as_m256i()
            .as_i8x32();

        // VPACKSSWB interleaves per 128-bit lane: a-lo, b-lo, a-hi, b-hi.
        // Indices >= 32 address `clamped_b` in the two-operand shuffle.
        #[rustfmt::skip]
        const IDXS: [u32; 32] = [
            00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to i8 conversions
            32, 34, 36, 38, 40, 42, 44, 46, // b-lo
            16, 18, 20, 22, 24, 26, 28, 30, // a-hi
            48, 50, 52, 54, 56, 58, 60, 62, // b-hi
        ];
        let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);

        result.as_m256i()
    }
}
2351
/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackssdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Saturate each 32-bit lane into the i16 range before narrowing.
        let max = simd_splat(i16::MAX as i32);
        let min = simd_splat(i16::MIN as i32);

        // Clamp, then reinterpret as i16 lanes; each i32 lane now holds its
        // saturated value in the low i16 (little-endian), which the shuffle
        // below selects via the even indices.
        let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
            .as_m256i()
            .as_i16x16();
        let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min)
            .as_m256i()
            .as_i16x16();

        // VPACKSSDW interleaves per 128-bit lane: a-lo, b-lo, a-hi, b-hi.
        // Indices >= 16 address `clamped_b` in the two-operand shuffle.
        #[rustfmt::skip]
        const IDXS: [u32; 16] = [
            00, 02, 04, 06, // a-lo i32 to i16 conversions
            16, 18, 20, 22, // b-lo
            08, 10, 12, 14, // a-hi
            24, 26, 28, 30, // b-hi
        ];
        let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);

        result.as_m256i()
    }
}
2385
/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackuswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Unsigned saturation of a *signed* source: clamp into [0, 255]
        // using signed min/max (negative inputs saturate to 0).
        let max = simd_splat(u8::MAX as i16);
        let min = simd_splat(u8::MIN as i16);

        let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
            .as_m256i()
            .as_i8x32();
        let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min)
            .as_m256i()
            .as_i8x32();

        // VPACKUSWB interleaves per 128-bit lane: a-lo, b-lo, a-hi, b-hi.
        // Indices >= 32 address `clamped_b` in the two-operand shuffle.
        #[rustfmt::skip]
        const IDXS: [u32; 32] = [
            00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to u8 conversions
            32, 34, 36, 38, 40, 42, 44, 46, // b-lo
            16, 18, 20, 22, 24, 26, 28, 30, // a-hi
            48, 50, 52, 54, 56, 58, 60, 62, // b-hi
        ];
        let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);

        result.as_m256i()
    }
}
2419
/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Unsigned saturation of a *signed* source: clamp into [0, 65535]
        // using signed min/max (negative inputs saturate to 0).
        let max = simd_splat(u16::MAX as i32);
        let min = simd_splat(u16::MIN as i32);

        let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
            .as_m256i()
            .as_i16x16();
        let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min)
            .as_m256i()
            .as_i16x16();

        // VPACKUSDW interleaves per 128-bit lane: a-lo, b-lo, a-hi, b-hi.
        // Indices >= 16 address `clamped_b` in the two-operand shuffle.
        #[rustfmt::skip]
        const IDXS: [u32; 16] = [
            00, 02, 04, 06, // a-lo i32 to u16 conversions
            16, 18, 20, 22, // b-lo
            08, 10, 12, 14, // a-hi
            24, 26, 28, 30, // b-hi
        ];
        let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);

        result.as_m256i()
    }
}
2453
/// Permutes packed 32-bit integers from `a` according to the content of `b`.
///
/// The last 3 bits of each integer of `b` are used as addresses into the 8
/// integers of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Runtime (non-const) indices require the `permd` intrinsic declared
    // elsewhere in this module; `simd_shuffle!` needs const indices.
    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
}
2467
/// Permutes 64-bit integers from `a` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Second shuffle operand is never selected (all indices are < 4);
        // zero is just a placeholder for the two-operand shuffle form.
        let zero = i64x4::ZERO;
        // Each 2-bit field of IMM8 selects the source element for one
        // destination lane (lane i <- a[IMM8 >> (2*i) & 3]).
        let r: i64x4 = simd_shuffle!(
            a.as_i64x4(),
            zero,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        );
        transmute(r)
    }
}
2494
/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // The AVX2 integer form has the same lane-selection semantics as the
    // AVX floating-point variant, so reuse that implementation.
    _mm256_permute2f128_si256::<IMM8>(a, b)
}
2508
/// Shuffles 64-bit floating-point elements in `a` across lanes using the
/// control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Each 2-bit field of IMM8 selects the source element for one
        // destination lane. The second operand (undefined) is never
        // selected since all indices are < 4.
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}
2534
/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
/// the corresponding 32-bit integer index in `idx`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
    // Runtime indices require the `permps` intrinsic declared elsewhere in
    // this module; `simd_shuffle!` only accepts const indices.
    unsafe { permps(a, idx.as_i32x8()) }
}
2546
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sum each consecutive 8 differences to
/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
/// integers in the low 16 bits of each 64-bit element of the return value
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
    // Delegates to the `psadbw` intrinsic declared elsewhere in this module;
    // there is no generic `simd_*` equivalent for the horizontal SAD.
    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
}
2560
/// Shuffles bytes from `a` according to the content of `b`.
///
/// For each of the 128-bit low and high halves of the vectors, the last
/// 4 bits of each byte of `b` are used as addresses into the respective
/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
///
/// In addition, if the highest significant bit of a byte of `b` is set, the
/// respective destination byte is set to 0.
///
/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
/// equivalent to:
///
/// ```
/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
///     let mut r = [0; 32];
///     for i in 0..16 {
///         // if the most significant bit of b is set,
///         // then the destination byte is set to 0.
///         if b[i] & 0x80 == 0u8 {
///             r[i] = a[(b[i] % 16) as usize];
///         }
///         if b[i + 16] & 0x80 == 0u8 {
///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
///         }
///     }
///     r
/// }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshufb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Runtime byte indices require the `pshufb` intrinsic declared elsewhere
    // in this module; `simd_shuffle!` needs const indices.
    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
}
2598
/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
/// `imm8`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
///
/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
///
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(MASK, 8)
    ;
    unsafe {
        // The same 8-bit control is applied to both 128-bit lanes: indices
        // 0..4 pick within the low lane, and the `+ 4` entries repeat the
        // pattern for the high lane.
        let r: i32x8 = simd_shuffle!(
            a.as_i32x8(),
            a.as_i32x8(),
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                (MASK as u32 >> 4) & 0b11,
                (MASK as u32 >> 6) & 0b11,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 4,
                ((MASK as u32 >> 6) & 0b11) + 4,
            ],
        );
        transmute(r)
    }
}
2655
/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
/// to the output.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        // Lanes 0-3 and 8-11 (the low 64 bits of each 128-bit half) pass
        // through unchanged; the 2-bit fields of IMM8 permute lanes 4-7 and,
        // with the same pattern, lanes 12-15.
        let r: i16x16 = simd_shuffle!(
            a,
            a,
            [
                0,
                1,
                2,
                3,
                4 + (IMM8 as u32 & 0b11),
                4 + ((IMM8 as u32 >> 2) & 0b11),
                4 + ((IMM8 as u32 >> 4) & 0b11),
                4 + ((IMM8 as u32 >> 6) & 0b11),
                8,
                9,
                10,
                11,
                12 + (IMM8 as u32 & 0b11),
                12 + ((IMM8 as u32 >> 2) & 0b11),
                12 + ((IMM8 as u32 >> 4) & 0b11),
                12 + ((IMM8 as u32 >> 6) & 0b11),
            ],
        );
        transmute(r)
    }
}
2696
/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
/// to the output.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        // Mirror image of `_mm256_shufflehi_epi16`: the 2-bit fields of IMM8
        // permute lanes 0-3 and, with the same pattern, lanes 8-11, while
        // lanes 4-7 and 12-15 (the high 64 bits of each half) pass through.
        let r: i16x16 = simd_shuffle!(
            a,
            a,
            [
                0 + (IMM8 as u32 & 0b11),
                0 + ((IMM8 as u32 >> 2) & 0b11),
                0 + ((IMM8 as u32 >> 4) & 0b11),
                0 + ((IMM8 as u32 >> 6) & 0b11),
                4,
                5,
                6,
                7,
                8 + (IMM8 as u32 & 0b11),
                8 + ((IMM8 as u32 >> 2) & 0b11),
                8 + ((IMM8 as u32 >> 4) & 0b11),
                8 + ((IMM8 as u32 >> 6) & 0b11),
                12,
                13,
                14,
                15,
            ],
        );
        transmute(r)
    }
}
2737
/// Negates packed 16-bit integers in `a` when the corresponding signed
/// 16-bit integer in `b` is negative, and returns the results.
/// Results are zeroed out when the corresponding element in `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Delegates to the `psignw` intrinsic declared elsewhere in this module.
    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
}
2750
/// Negates packed 32-bit integers in `a` when the corresponding signed
/// 32-bit integer in `b` is negative, and returns the results.
/// Results are zeroed out when the corresponding element in `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Delegates to the `psignd` intrinsic declared elsewhere in this module.
    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
}
2763
/// Negates packed 8-bit integers in `a` when the corresponding signed
/// 8-bit integer in `b` is negative, and returns the results.
/// Results are zeroed out when the corresponding element in `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
    // Delegates to the `psignb` intrinsic declared elsewhere in this module.
    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
}
2776
/// Shifts packed 16-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
    // Runtime shift count requires the `psllw` intrinsic declared elsewhere
    // in this module (the whole 128-bit `count` is the shift amount).
    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
}
2788
/// Shifts packed 32-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
    // Runtime shift count requires the `pslld` intrinsic declared elsewhere
    // in this module.
    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
}
2800
/// Shifts packed 64-bit integers in `a` left by `count` while
/// shifting in zeros, and returns the result
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
    // Runtime shift count requires the `psllq` intrinsic declared elsewhere
    // in this module.
    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
}
2812
/// Shifts packed 16-bit integers in `a` left by `IMM8` while
/// shifting in zeros, return the results;
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // A count of 16 or more zeroes the result (hardware semantics) and
        // would be undefined for `simd_shl`, so handle it explicitly.
        if IMM8 >= 16 {
            _mm256_setzero_si256()
        } else {
            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
        }
    }
}
2833
2834/// Shifts packed 32-bit integers in `a` left by `IMM8` while
2835/// shifting in zeros, return the results;
2836///
2837/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2838#[inline]
2839#[target_feature(enable = "avx2")]
2840#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2841#[rustc_legacy_const_generics(1)]
2842#[stable(feature = "simd_x86", since = "1.27.0")]
2843#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2844pub const fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2845    unsafe {
2846        static_assert_uimm_bits!(IMM8, 8);
2847        if IMM8 >= 32 {
2848            _mm256_setzero_si256()
2849        } else {
2850            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2851        }
2852    }
2853}
2854
2855/// Shifts packed 64-bit integers in `a` left by `IMM8` while
2856/// shifting in zeros, return the results;
2857///
2858/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2859#[inline]
2860#[target_feature(enable = "avx2")]
2861#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2862#[rustc_legacy_const_generics(1)]
2863#[stable(feature = "simd_x86", since = "1.27.0")]
2864#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2865pub const fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2866    unsafe {
2867        static_assert_uimm_bits!(IMM8, 8);
2868        if IMM8 >= 64 {
2869            _mm256_setzero_si256()
2870        } else {
2871            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2872        }
2873    }
2874}
2875
/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // `_mm256_slli_si256` and `_mm256_bslli_epi128` are two Intel names for
    // the same VPSLLDQ operation; delegate to the shared implementation.
    _mm256_bslli_epi128::<IMM8>(a)
}
2889
/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Computes the shuffle index for destination byte `i`: a zero byte
    // (index 0 into the first, all-zero operand) when the shift spills past
    // the start of the 128-bit lane or exceeds 15, otherwise byte `i - shift`
    // of `a` (offset by 32 because `a` is the second shuffle operand).
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || i % 16 < shift {
            0
        } else {
            32 + (i - shift)
        }
    }
    unsafe {
        let a = a.as_i8x32();
        let r: i8x32 = simd_shuffle!(
            i8x32::ZERO,
            a,
            [
                mask(IMM8, 0),
                mask(IMM8, 1),
                mask(IMM8, 2),
                mask(IMM8, 3),
                mask(IMM8, 4),
                mask(IMM8, 5),
                mask(IMM8, 6),
                mask(IMM8, 7),
                mask(IMM8, 8),
                mask(IMM8, 9),
                mask(IMM8, 10),
                mask(IMM8, 11),
                mask(IMM8, 12),
                mask(IMM8, 13),
                mask(IMM8, 14),
                mask(IMM8, 15),
                mask(IMM8, 16),
                mask(IMM8, 17),
                mask(IMM8, 18),
                mask(IMM8, 19),
                mask(IMM8, 20),
                mask(IMM8, 21),
                mask(IMM8, 22),
                mask(IMM8, 23),
                mask(IMM8, 24),
                mask(IMM8, 25),
                mask(IMM8, 26),
                mask(IMM8, 27),
                mask(IMM8, 28),
                mask(IMM8, 29),
                mask(IMM8, 30),
                mask(IMM8, 31),
            ],
        );
        transmute(r)
    }
}
2952
/// Shifts packed 32-bit integers in `a` left by the amount
/// specified by the corresponding element in `count` while
/// shifting in zeros, and returns the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe {
        let count = count.as_u32x4();
        // Counts >= 32 zero the corresponding element (VPSLLVD semantics)
        // but are undefined for `simd_shl`, so shift by 0 in those lanes and
        // then force the result to 0 with the second select.
        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
        let count = simd_select(no_overflow, count, u32x4::ZERO);
        simd_select(no_overflow, simd_shl(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
    }
}
2971
2972/// Shifts packed 32-bit integers in `a` left by the amount
2973/// specified by the corresponding element in `count` while
2974/// shifting in zeros, and returns the result.
2975///
2976/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2977#[inline]
2978#[target_feature(enable = "avx2")]
2979#[cfg_attr(test, assert_instr(vpsllvd))]
2980#[stable(feature = "simd_x86", since = "1.27.0")]
2981#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2982pub const fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2983    unsafe {
2984        let count = count.as_u32x8();
2985        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
2986        let count = simd_select(no_overflow, count, u32x8::ZERO);
2987        simd_select(no_overflow, simd_shl(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
2988    }
2989}
2990
2991/// Shifts packed 64-bit integers in `a` left by the amount
2992/// specified by the corresponding element in `count` while
2993/// shifting in zeros, and returns the result.
2994///
2995/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2996#[inline]
2997#[target_feature(enable = "avx2")]
2998#[cfg_attr(test, assert_instr(vpsllvq))]
2999#[stable(feature = "simd_x86", since = "1.27.0")]
3000#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3001pub const fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
3002    unsafe {
3003        let count = count.as_u64x2();
3004        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
3005        let count = simd_select(no_overflow, count, u64x2::ZERO);
3006        simd_select(no_overflow, simd_shl(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
3007    }
3008}
3009
3010/// Shifts packed 64-bit integers in `a` left by the amount
3011/// specified by the corresponding element in `count` while
3012/// shifting in zeros, and returns the result.
3013///
3014/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
3015#[inline]
3016#[target_feature(enable = "avx2")]
3017#[cfg_attr(test, assert_instr(vpsllvq))]
3018#[stable(feature = "simd_x86", since = "1.27.0")]
3019#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3020pub const fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
3021    unsafe {
3022        let count = count.as_u64x4();
3023        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
3024        let count = simd_select(no_overflow, count, u64x4::ZERO);
3025        simd_select(no_overflow, simd_shl(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
3026    }
3027}
3028
3029/// Shifts packed 16-bit integers in `a` right by `count` while
3030/// shifting in sign bits.
3031///
3032/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
3033#[inline]
3034#[target_feature(enable = "avx2")]
3035#[cfg_attr(test, assert_instr(vpsraw))]
3036#[stable(feature = "simd_x86", since = "1.27.0")]
3037pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
3038    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
3039}
3040
3041/// Shifts packed 32-bit integers in `a` right by `count` while
3042/// shifting in sign bits.
3043///
3044/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
3045#[inline]
3046#[target_feature(enable = "avx2")]
3047#[cfg_attr(test, assert_instr(vpsrad))]
3048#[stable(feature = "simd_x86", since = "1.27.0")]
3049pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
3050    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
3051}
3052
3053/// Shifts packed 16-bit integers in `a` right by `IMM8` while
3054/// shifting in sign bits.
3055///
3056/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
3057#[inline]
3058#[target_feature(enable = "avx2")]
3059#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
3060#[rustc_legacy_const_generics(1)]
3061#[stable(feature = "simd_x86", since = "1.27.0")]
3062#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3063pub const fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3064    static_assert_uimm_bits!(IMM8, 8);
3065    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
3066}
3067
3068/// Shifts packed 32-bit integers in `a` right by `IMM8` while
3069/// shifting in sign bits.
3070///
3071/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
3072#[inline]
3073#[target_feature(enable = "avx2")]
3074#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
3075#[rustc_legacy_const_generics(1)]
3076#[stable(feature = "simd_x86", since = "1.27.0")]
3077#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3078pub const fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3079    static_assert_uimm_bits!(IMM8, 8);
3080    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
3081}
3082
3083/// Shifts packed 32-bit integers in `a` right by the amount specified by the
3084/// corresponding element in `count` while shifting in sign bits.
3085///
3086/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
3087#[inline]
3088#[target_feature(enable = "avx2")]
3089#[cfg_attr(test, assert_instr(vpsravd))]
3090#[stable(feature = "simd_x86", since = "1.27.0")]
3091#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3092pub const fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
3093    unsafe {
3094        let count = count.as_u32x4();
3095        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
3096        let count = simd_select(no_overflow, transmute(count), i32x4::splat(31));
3097        simd_shr(a.as_i32x4(), count).as_m128i()
3098    }
3099}
3100
3101/// Shifts packed 32-bit integers in `a` right by the amount specified by the
3102/// corresponding element in `count` while shifting in sign bits.
3103///
3104/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
3105#[inline]
3106#[target_feature(enable = "avx2")]
3107#[cfg_attr(test, assert_instr(vpsravd))]
3108#[stable(feature = "simd_x86", since = "1.27.0")]
3109#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3110pub const fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
3111    unsafe {
3112        let count = count.as_u32x8();
3113        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
3114        let count = simd_select(no_overflow, transmute(count), i32x8::splat(31));
3115        simd_shr(a.as_i32x8(), count).as_m256i()
3116    }
3117}
3118
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // `_mm256_srli_si256` and `_mm256_bsrli_epi128` are two Intel names for
    // the same per-lane `vpsrldq` byte shift, so this simply forwards.
    _mm256_bsrli_epi128::<IMM8>(a)
}
3132
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Computes the shuffle index for output byte `i` of a per-128-bit-lane
    // right byte shift. In the `simd_shuffle!` below, indices 0..=31 select
    // from the all-zero first operand and indices 32..=63 select byte
    // `idx - 32` of `a`, so returning 0 zero-fills the byte and
    // `32 + (i + shift)` copies byte `i + shift` of `a`.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || (15 - (i % 16)) < shift {
            // A shift of 16+ clears the whole lane; otherwise the high
            // `shift` bytes of each lane are zero-filled.
            0
        } else {
            32 + (i + shift)
        }
    }
    unsafe {
        let a = a.as_i8x32();
        let r: i8x32 = simd_shuffle!(
            i8x32::ZERO,
            a,
            [
                mask(IMM8, 0),
                mask(IMM8, 1),
                mask(IMM8, 2),
                mask(IMM8, 3),
                mask(IMM8, 4),
                mask(IMM8, 5),
                mask(IMM8, 6),
                mask(IMM8, 7),
                mask(IMM8, 8),
                mask(IMM8, 9),
                mask(IMM8, 10),
                mask(IMM8, 11),
                mask(IMM8, 12),
                mask(IMM8, 13),
                mask(IMM8, 14),
                mask(IMM8, 15),
                mask(IMM8, 16),
                mask(IMM8, 17),
                mask(IMM8, 18),
                mask(IMM8, 19),
                mask(IMM8, 20),
                mask(IMM8, 21),
                mask(IMM8, 22),
                mask(IMM8, 23),
                mask(IMM8, 24),
                mask(IMM8, 25),
                mask(IMM8, 26),
                mask(IMM8, 27),
                mask(IMM8, 28),
                mask(IMM8, 29),
                mask(IMM8, 30),
                mask(IMM8, 31),
            ],
        );
        transmute(r)
    }
}
3195
3196/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3197/// zeros.
3198///
3199/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
3200#[inline]
3201#[target_feature(enable = "avx2")]
3202#[cfg_attr(test, assert_instr(vpsrlw))]
3203#[stable(feature = "simd_x86", since = "1.27.0")]
3204pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3205    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
3206}
3207
3208/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3209/// zeros.
3210///
3211/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
3212#[inline]
3213#[target_feature(enable = "avx2")]
3214#[cfg_attr(test, assert_instr(vpsrld))]
3215#[stable(feature = "simd_x86", since = "1.27.0")]
3216pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3217    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
3218}
3219
3220/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3221/// zeros.
3222///
3223/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3224#[inline]
3225#[target_feature(enable = "avx2")]
3226#[cfg_attr(test, assert_instr(vpsrlq))]
3227#[stable(feature = "simd_x86", since = "1.27.0")]
3228pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3229    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
3230}
3231
3232/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
3233/// zeros
3234///
3235/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3236#[inline]
3237#[target_feature(enable = "avx2")]
3238#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3239#[rustc_legacy_const_generics(1)]
3240#[stable(feature = "simd_x86", since = "1.27.0")]
3241#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3242pub const fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3243    static_assert_uimm_bits!(IMM8, 8);
3244    unsafe {
3245        if IMM8 >= 16 {
3246            _mm256_setzero_si256()
3247        } else {
3248            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3249        }
3250    }
3251}
3252
3253/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
3254/// zeros
3255///
3256/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3257#[inline]
3258#[target_feature(enable = "avx2")]
3259#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3260#[rustc_legacy_const_generics(1)]
3261#[stable(feature = "simd_x86", since = "1.27.0")]
3262#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3263pub const fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3264    static_assert_uimm_bits!(IMM8, 8);
3265    unsafe {
3266        if IMM8 >= 32 {
3267            _mm256_setzero_si256()
3268        } else {
3269            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3270        }
3271    }
3272}
3273
3274/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3275/// zeros
3276///
3277/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3278#[inline]
3279#[target_feature(enable = "avx2")]
3280#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3281#[rustc_legacy_const_generics(1)]
3282#[stable(feature = "simd_x86", since = "1.27.0")]
3283#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3284pub const fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3285    static_assert_uimm_bits!(IMM8, 8);
3286    unsafe {
3287        if IMM8 >= 64 {
3288            _mm256_setzero_si256()
3289        } else {
3290            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3291        }
3292    }
3293}
3294
3295/// Shifts packed 32-bit integers in `a` right by the amount specified by
3296/// the corresponding element in `count` while shifting in zeros,
3297///
3298/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3299#[inline]
3300#[target_feature(enable = "avx2")]
3301#[cfg_attr(test, assert_instr(vpsrlvd))]
3302#[stable(feature = "simd_x86", since = "1.27.0")]
3303#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3304pub const fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3305    unsafe {
3306        let count = count.as_u32x4();
3307        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
3308        let count = simd_select(no_overflow, count, u32x4::ZERO);
3309        simd_select(no_overflow, simd_shr(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
3310    }
3311}
3312
3313/// Shifts packed 32-bit integers in `a` right by the amount specified by
3314/// the corresponding element in `count` while shifting in zeros,
3315///
3316/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3317#[inline]
3318#[target_feature(enable = "avx2")]
3319#[cfg_attr(test, assert_instr(vpsrlvd))]
3320#[stable(feature = "simd_x86", since = "1.27.0")]
3321#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3322pub const fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3323    unsafe {
3324        let count = count.as_u32x8();
3325        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
3326        let count = simd_select(no_overflow, count, u32x8::ZERO);
3327        simd_select(no_overflow, simd_shr(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
3328    }
3329}
3330
3331/// Shifts packed 64-bit integers in `a` right by the amount specified by
3332/// the corresponding element in `count` while shifting in zeros,
3333///
3334/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3335#[inline]
3336#[target_feature(enable = "avx2")]
3337#[cfg_attr(test, assert_instr(vpsrlvq))]
3338#[stable(feature = "simd_x86", since = "1.27.0")]
3339#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3340pub const fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3341    unsafe {
3342        let count = count.as_u64x2();
3343        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
3344        let count = simd_select(no_overflow, count, u64x2::ZERO);
3345        simd_select(no_overflow, simd_shr(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
3346    }
3347}
3348
3349/// Shifts packed 64-bit integers in `a` right by the amount specified by
3350/// the corresponding element in `count` while shifting in zeros,
3351///
3352/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3353#[inline]
3354#[target_feature(enable = "avx2")]
3355#[cfg_attr(test, assert_instr(vpsrlvq))]
3356#[stable(feature = "simd_x86", since = "1.27.0")]
3357#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3358pub const fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3359    unsafe {
3360        let count = count.as_u64x4();
3361        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
3362        let count = simd_select(no_overflow, count, u64x4::ZERO);
3363        simd_select(no_overflow, simd_shr(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
3364    }
3365}
3366
/// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
/// must be aligned on a 32-byte boundary or a general-protection exception may be generated. To
/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
    let dst: __m256i;
    // Emit `vmovntdqa` via inline asm so the non-temporal load is preserved
    // exactly as written.
    // NOTE(review): `vpl!` is defined elsewhere in this crate; presumably it
    // completes the asm template with the `{p}` memory operand — confirm
    // against its definition.
    crate::arch::asm!(
        vpl!("vmovntdqa {a}"),
        a = out(ymm_reg) dst,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags),
    );
    dst
}
3386
3387/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
3388///
3389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3390#[inline]
3391#[target_feature(enable = "avx2")]
3392#[cfg_attr(test, assert_instr(vpsubw))]
3393#[stable(feature = "simd_x86", since = "1.27.0")]
3394#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3395pub const fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3396    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3397}
3398
3399/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`
3400///
3401/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3402#[inline]
3403#[target_feature(enable = "avx2")]
3404#[cfg_attr(test, assert_instr(vpsubd))]
3405#[stable(feature = "simd_x86", since = "1.27.0")]
3406#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3407pub const fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3408    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3409}
3410
3411/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`
3412///
3413/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3414#[inline]
3415#[target_feature(enable = "avx2")]
3416#[cfg_attr(test, assert_instr(vpsubq))]
3417#[stable(feature = "simd_x86", since = "1.27.0")]
3418#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3419pub const fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3420    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3421}
3422
3423/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
3424///
3425/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3426#[inline]
3427#[target_feature(enable = "avx2")]
3428#[cfg_attr(test, assert_instr(vpsubb))]
3429#[stable(feature = "simd_x86", since = "1.27.0")]
3430#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3431pub const fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3432    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3433}
3434
3435/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in
3436/// `a` using saturation.
3437///
3438/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3439#[inline]
3440#[target_feature(enable = "avx2")]
3441#[cfg_attr(test, assert_instr(vpsubsw))]
3442#[stable(feature = "simd_x86", since = "1.27.0")]
3443#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3444pub const fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3445    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3446}
3447
3448/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in
3449/// `a` using saturation.
3450///
3451/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3452#[inline]
3453#[target_feature(enable = "avx2")]
3454#[cfg_attr(test, assert_instr(vpsubsb))]
3455#[stable(feature = "simd_x86", since = "1.27.0")]
3456#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3457pub const fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3458    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3459}
3460
3461/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit
3462/// integers in `a` using saturation.
3463///
3464/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3465#[inline]
3466#[target_feature(enable = "avx2")]
3467#[cfg_attr(test, assert_instr(vpsubusw))]
3468#[stable(feature = "simd_x86", since = "1.27.0")]
3469#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3470pub const fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3471    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3472}
3473
3474/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit
3475/// integers in `a` using saturation.
3476///
3477/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3478#[inline]
3479#[target_feature(enable = "avx2")]
3480#[cfg_attr(test, assert_instr(vpsubusb))]
3481#[stable(feature = "simd_x86", since = "1.27.0")]
3482#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3483pub const fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3484    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3485}
3486
/// Unpacks and interleaves 8-bit integers from the high half of each
/// 128-bit lane in `a` and `b`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// let b = _mm256_setr_epi8(
///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
///     -30, -31,
/// );
///
/// let c = _mm256_unpackhi_epi8(a, b);
///
/// let expected = _mm256_setr_epi8(
///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
///     -31,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // In `simd_shuffle!`, indices 0..=31 select from `a` and 32..=63
        // select from `b`; each row below interleaves the high 8 bytes of a
        // 128-bit lane of `a` with the high 8 bytes of the same lane of `b`.
        #[rustfmt::skip]
        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
                8, 40, 9, 41, 10, 42, 11, 43,
                12, 44, 13, 45, 14, 46, 15, 47,
                24, 56, 25, 57, 26, 58, 27, 59,
                28, 60, 29, 61, 30, 62, 31, 63,
        ]);
        transmute(r)
    }
}
3543
/// Unpacks and interleaves 8-bit integers from the low half of each
/// 128-bit lane of `a` and `b`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// let b = _mm256_setr_epi8(
///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
///     -30, -31,
/// );
///
/// let c = _mm256_unpacklo_epi8(a, b);
///
/// let expected = _mm256_setr_epi8(
///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // In `simd_shuffle!`, indices 0..=31 select from `a` and 32..=63
        // select from `b`; each row below interleaves the low 8 bytes of a
        // 128-bit lane of `a` with the low 8 bytes of the same lane of `b`.
        #[rustfmt::skip]
        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
            0, 32, 1, 33, 2, 34, 3, 35,
            4, 36, 5, 37, 6, 38, 7, 39,
            16, 48, 17, 49, 18, 50, 19, 51,
            20, 52, 21, 53, 22, 54, 23, 55,
        ]);
        transmute(r)
    }
}
3599
/// Unpacks and interleaves 16-bit integers from the high half of each
/// 128-bit lane of `a` and `b`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi16(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
/// );
/// let b = _mm256_setr_epi16(
///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
/// );
///
/// let c = _mm256_unpackhi_epi16(a, b);
///
/// let expected = _mm256_setr_epi16(
///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // In `simd_shuffle!`, indices 0..=15 select from `a` and 16..=31
        // select from `b`; this interleaves the high 4 words of each 128-bit
        // lane of `a` with the high 4 words of the same lane of `b`.
        let r: i16x16 = simd_shuffle!(
            a.as_i16x16(),
            b.as_i16x16(),
            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
        );
        transmute(r)
    }
}
3649
/// Unpacks and interleaves 16-bit integers from the low half of each
/// 128-bit lane of `a` and `b`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
///
/// let a = _mm256_setr_epi16(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
/// );
/// let b = _mm256_setr_epi16(
///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
/// );
///
/// let c = _mm256_unpacklo_epi16(a, b);
///
/// let expected = _mm256_setr_epi16(
///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // In `simd_shuffle!`, indices 0..=15 select from `a` and 16..=31
        // select from `b`; this interleaves the low 4 words of each 128-bit
        // lane of `a` with the low 4 words of the same lane of `b`.
        let r: i16x16 = simd_shuffle!(
            a.as_i16x16(),
            b.as_i16x16(),
            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
        );
        transmute(r)
    }
}
3700
/// Unpacks and interleaves 32-bit integers from the high half of each
3702/// 128-bit lane of `a` and `b`.
3703///
3704/// ```rust
3705/// #[cfg(target_arch = "x86")]
3706/// use std::arch::x86::*;
3707/// #[cfg(target_arch = "x86_64")]
3708/// use std::arch::x86_64::*;
3709///
3710/// # fn main() {
3711/// #     if is_x86_feature_detected!("avx2") {
3712/// #         #[target_feature(enable = "avx2")]
3713/// #         unsafe fn worker() {
3714/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3715/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3716///
3717/// let c = _mm256_unpackhi_epi32(a, b);
3718///
3719/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3720/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3721///
3722/// #         }
3723/// #         unsafe { worker(); }
3724/// #     }
3725/// # }
3726/// ```
3727///
3728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3729#[inline]
3730#[target_feature(enable = "avx2")]
3731#[cfg_attr(test, assert_instr(vunpckhps))]
3732#[stable(feature = "simd_x86", since = "1.27.0")]
3733#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3734pub const fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3735    unsafe {
3736        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3737        transmute(r)
3738    }
3739}
3740
/// Unpacks and interleaves 32-bit integers from the low half of each
3742/// 128-bit lane of `a` and `b`.
3743///
3744/// ```rust
3745/// #[cfg(target_arch = "x86")]
3746/// use std::arch::x86::*;
3747/// #[cfg(target_arch = "x86_64")]
3748/// use std::arch::x86_64::*;
3749///
3750/// # fn main() {
3751/// #     if is_x86_feature_detected!("avx2") {
3752/// #         #[target_feature(enable = "avx2")]
3753/// #         unsafe fn worker() {
3754/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3755/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3756///
3757/// let c = _mm256_unpacklo_epi32(a, b);
3758///
3759/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3760/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3761///
3762/// #         }
3763/// #         unsafe { worker(); }
3764/// #     }
3765/// # }
3766/// ```
3767///
3768/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3769#[inline]
3770#[target_feature(enable = "avx2")]
3771#[cfg_attr(test, assert_instr(vunpcklps))]
3772#[stable(feature = "simd_x86", since = "1.27.0")]
3773#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3774pub const fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3775    unsafe {
3776        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3777        transmute(r)
3778    }
3779}
3780
/// Unpacks and interleaves 64-bit integers from the high half of each
3782/// 128-bit lane of `a` and `b`.
3783///
3784/// ```rust
3785/// #[cfg(target_arch = "x86")]
3786/// use std::arch::x86::*;
3787/// #[cfg(target_arch = "x86_64")]
3788/// use std::arch::x86_64::*;
3789///
3790/// # fn main() {
3791/// #     if is_x86_feature_detected!("avx2") {
3792/// #         #[target_feature(enable = "avx2")]
3793/// #         unsafe fn worker() {
3794/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3795/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3796///
3797/// let c = _mm256_unpackhi_epi64(a, b);
3798///
3799/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3800/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3801///
3802/// #         }
3803/// #         unsafe { worker(); }
3804/// #     }
3805/// # }
3806/// ```
3807///
3808/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3809#[inline]
3810#[target_feature(enable = "avx2")]
3811#[cfg_attr(test, assert_instr(vunpckhpd))]
3812#[stable(feature = "simd_x86", since = "1.27.0")]
3813#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3814pub const fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3815    unsafe {
3816        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3817        transmute(r)
3818    }
3819}
3820
/// Unpacks and interleaves 64-bit integers from the low half of each
3822/// 128-bit lane of `a` and `b`.
3823///
3824/// ```rust
3825/// #[cfg(target_arch = "x86")]
3826/// use std::arch::x86::*;
3827/// #[cfg(target_arch = "x86_64")]
3828/// use std::arch::x86_64::*;
3829///
3830/// # fn main() {
3831/// #     if is_x86_feature_detected!("avx2") {
3832/// #         #[target_feature(enable = "avx2")]
3833/// #         unsafe fn worker() {
3834/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3835/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3836///
3837/// let c = _mm256_unpacklo_epi64(a, b);
3838///
3839/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3840/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3841///
3842/// #         }
3843/// #         unsafe { worker(); }
3844/// #     }
3845/// # }
3846/// ```
3847///
3848/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3849#[inline]
3850#[target_feature(enable = "avx2")]
3851#[cfg_attr(test, assert_instr(vunpcklpd))]
3852#[stable(feature = "simd_x86", since = "1.27.0")]
3853#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3854pub const fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3855    unsafe {
3856        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3857        transmute(r)
3858    }
3859}
3860
3861/// Computes the bitwise XOR of 256 bits (representing integer data)
/// in `a` and `b`.
3863///
3864/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3865#[inline]
3866#[target_feature(enable = "avx2")]
3867#[cfg_attr(test, assert_instr(vxorps))]
3868#[stable(feature = "simd_x86", since = "1.27.0")]
3869#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3870pub const fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3871    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3872}
3873
3874/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3875/// integer containing the zero-extended integer data.
3876///
3877/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3878///
3879/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3880#[inline]
3881#[target_feature(enable = "avx2")]
3882// This intrinsic has no corresponding instruction.
3883#[rustc_legacy_const_generics(1)]
3884#[stable(feature = "simd_x86", since = "1.27.0")]
3885#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3886pub const fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3887    static_assert_uimm_bits!(INDEX, 5);
3888    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
3889}
3890
3891/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3892/// integer containing the zero-extended integer data.
3893///
3894/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3895///
3896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3897#[inline]
3898#[target_feature(enable = "avx2")]
3899// This intrinsic has no corresponding instruction.
3900#[rustc_legacy_const_generics(1)]
3901#[stable(feature = "simd_x86", since = "1.27.0")]
3902#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3903pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3904    static_assert_uimm_bits!(INDEX, 4);
3905    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
3906}
3907
// Raw LLVM intrinsic declarations backing the AVX2 intrinsics in this module.
// Each `link_name` names an LLVM x86 builtin; the Rust signatures mirror the
// LLVM ones, which is why SIMD vector types appear in an `extern "C"` block
// (hence the `improper_ctypes` allow).
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Multiply-add and sum-of-absolute-differences horizontal operations.
    #[link_name = "llvm.x86.avx2.pmadd.wd"]
    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
    fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
    #[link_name = "llvm.x86.avx2.mpsadbw"]
    fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.psad.bw"]
    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
    // Sign operations (negate/zero/keep each lane of `a` per the sign of `b`).
    #[link_name = "llvm.x86.avx2.psign.b"]
    fn psignb(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.avx2.psign.w"]
    fn psignw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.psign.d"]
    fn psignd(a: i32x8, b: i32x8) -> i32x8;
    // Shifts by a count held in the low element of a 128-bit vector.
    #[link_name = "llvm.x86.avx2.psll.w"]
    fn psllw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psll.d"]
    fn pslld(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psll.q"]
    fn psllq(a: i64x4, count: i64x2) -> i64x4;
    #[link_name = "llvm.x86.avx2.psra.w"]
    fn psraw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psra.d"]
    fn psrad(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psrl.w"]
    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psrl.d"]
    fn psrld(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psrl.q"]
    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
    // Byte shuffle and cross-lane permutes.
    #[link_name = "llvm.x86.avx2.pshuf.b"]
    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
    #[link_name = "llvm.x86.avx2.permd"]
    fn permd(a: u32x8, b: u32x8) -> u32x8;
    #[link_name = "llvm.x86.avx2.permps"]
    fn permps(a: __m256, b: i32x8) -> __m256;
    // Masked gathers. Naming scheme: gather.<offset width>.<element type>,
    // with the unprefixed names being the 128-bit forms and the `v`-prefixed
    // ones the 256-bit forms.
    #[link_name = "llvm.x86.avx2.gather.d.d"]
    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
    #[link_name = "llvm.x86.avx2.gather.d.q"]
    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.q.d"]
    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.q"]
    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.d.pd"]
    fn pgatherdpd(
        src: __m128d,
        slice: *const i8,
        offsets: i32x4,
        mask: __m128d,
        scale: i8,
    ) -> __m128d;
    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
    fn vpgatherdpd(
        src: __m256d,
        slice: *const i8,
        offsets: i32x4,
        mask: __m256d,
        scale: i8,
    ) -> __m256d;
    #[link_name = "llvm.x86.avx2.gather.q.pd"]
    fn pgatherqpd(
        src: __m128d,
        slice: *const i8,
        offsets: i64x2,
        mask: __m128d,
        scale: i8,
    ) -> __m128d;
    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
    fn vpgatherqpd(
        src: __m256d,
        slice: *const i8,
        offsets: i64x4,
        mask: __m256d,
        scale: i8,
    ) -> __m256d;
    #[link_name = "llvm.x86.avx2.gather.d.ps"]
    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
    -> __m128;
    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
    fn vpgatherdps(
        src: __m256,
        slice: *const i8,
        offsets: i32x8,
        mask: __m256,
        scale: i8,
    ) -> __m256;
    #[link_name = "llvm.x86.avx2.gather.q.ps"]
    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
    -> __m128;
    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
    fn vpgatherqps(
        src: __m128,
        slice: *const i8,
        offsets: i64x4,
        mask: __m128,
        scale: i8,
    ) -> __m128;
}
4019
4020#[cfg(test)]
4021mod tests {
4022    use crate::core_arch::assert_eq_const as assert_eq;
4023
4024    use stdarch_test::simd_test;
4025
4026    use crate::core_arch::x86::*;
4027
4028    #[simd_test(enable = "avx2")]
4029    const fn test_mm256_abs_epi32() {
4030        #[rustfmt::skip]
4031        let a = _mm256_setr_epi32(
4032            0, 1, -1, i32::MAX,
4033            i32::MIN, 100, -100, -32,
4034        );
4035        let r = _mm256_abs_epi32(a);
4036        #[rustfmt::skip]
4037        let e = _mm256_setr_epi32(
4038            0, 1, 1, i32::MAX,
4039            i32::MAX.wrapping_add(1), 100, 100, 32,
4040        );
4041        assert_eq_m256i(r, e);
4042    }
4043
4044    #[simd_test(enable = "avx2")]
4045    const fn test_mm256_abs_epi16() {
4046        #[rustfmt::skip]
4047        let a = _mm256_setr_epi16(
4048            0,  1, -1, 2, -2, 3, -3, 4,
4049            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
4050        );
4051        let r = _mm256_abs_epi16(a);
4052        #[rustfmt::skip]
4053        let e = _mm256_setr_epi16(
4054            0, 1, 1, 2, 2, 3, 3, 4,
4055            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
4056        );
4057        assert_eq_m256i(r, e);
4058    }
4059
4060    #[simd_test(enable = "avx2")]
4061    const fn test_mm256_abs_epi8() {
4062        #[rustfmt::skip]
4063        let a = _mm256_setr_epi8(
4064            0, 1, -1, 2, -2, 3, -3, 4,
4065            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4066            0, 1, -1, 2, -2, 3, -3, 4,
4067            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4068        );
4069        let r = _mm256_abs_epi8(a);
4070        #[rustfmt::skip]
4071        let e = _mm256_setr_epi8(
4072            0, 1, 1, 2, 2, 3, 3, 4,
4073            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4074            0, 1, 1, 2, 2, 3, 3, 4,
4075            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4076        );
4077        assert_eq_m256i(r, e);
4078    }
4079
4080    #[simd_test(enable = "avx2")]
4081    const fn test_mm256_add_epi64() {
4082        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4083        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4084        let r = _mm256_add_epi64(a, b);
4085        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4086        assert_eq_m256i(r, e);
4087    }
4088
4089    #[simd_test(enable = "avx2")]
4090    const fn test_mm256_add_epi32() {
4091        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4092        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4093        let r = _mm256_add_epi32(a, b);
4094        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4095        assert_eq_m256i(r, e);
4096    }
4097
4098    #[simd_test(enable = "avx2")]
4099    const fn test_mm256_add_epi16() {
4100        #[rustfmt::skip]
4101        let a = _mm256_setr_epi16(
4102            0, 1, 2, 3, 4, 5, 6, 7,
4103            8, 9, 10, 11, 12, 13, 14, 15,
4104        );
4105        #[rustfmt::skip]
4106        let b = _mm256_setr_epi16(
4107            0, 1, 2, 3, 4, 5, 6, 7,
4108            8, 9, 10, 11, 12, 13, 14, 15,
4109        );
4110        let r = _mm256_add_epi16(a, b);
4111        #[rustfmt::skip]
4112        let e = _mm256_setr_epi16(
4113            0, 2, 4, 6, 8, 10, 12, 14,
4114            16, 18, 20, 22, 24, 26, 28, 30,
4115        );
4116        assert_eq_m256i(r, e);
4117    }
4118
4119    #[simd_test(enable = "avx2")]
4120    const fn test_mm256_add_epi8() {
4121        #[rustfmt::skip]
4122        let a = _mm256_setr_epi8(
4123            0, 1, 2, 3, 4, 5, 6, 7,
4124            8, 9, 10, 11, 12, 13, 14, 15,
4125            16, 17, 18, 19, 20, 21, 22, 23,
4126            24, 25, 26, 27, 28, 29, 30, 31,
4127        );
4128        #[rustfmt::skip]
4129        let b = _mm256_setr_epi8(
4130            0, 1, 2, 3, 4, 5, 6, 7,
4131            8, 9, 10, 11, 12, 13, 14, 15,
4132            16, 17, 18, 19, 20, 21, 22, 23,
4133            24, 25, 26, 27, 28, 29, 30, 31,
4134        );
4135        let r = _mm256_add_epi8(a, b);
4136        #[rustfmt::skip]
4137        let e = _mm256_setr_epi8(
4138            0, 2, 4, 6, 8, 10, 12, 14,
4139            16, 18, 20, 22, 24, 26, 28, 30,
4140            32, 34, 36, 38, 40, 42, 44, 46,
4141            48, 50, 52, 54, 56, 58, 60, 62,
4142        );
4143        assert_eq_m256i(r, e);
4144    }
4145
4146    #[simd_test(enable = "avx2")]
4147    const fn test_mm256_adds_epi8() {
4148        #[rustfmt::skip]
4149        let a = _mm256_setr_epi8(
4150            0, 1, 2, 3, 4, 5, 6, 7,
4151            8, 9, 10, 11, 12, 13, 14, 15,
4152            16, 17, 18, 19, 20, 21, 22, 23,
4153            24, 25, 26, 27, 28, 29, 30, 31,
4154        );
4155        #[rustfmt::skip]
4156        let b = _mm256_setr_epi8(
4157            32, 33, 34, 35, 36, 37, 38, 39,
4158            40, 41, 42, 43, 44, 45, 46, 47,
4159            48, 49, 50, 51, 52, 53, 54, 55,
4160            56, 57, 58, 59, 60, 61, 62, 63,
4161        );
4162        let r = _mm256_adds_epi8(a, b);
4163        #[rustfmt::skip]
4164        let e = _mm256_setr_epi8(
4165            32, 34, 36, 38, 40, 42, 44, 46,
4166            48, 50, 52, 54, 56, 58, 60, 62,
4167            64, 66, 68, 70, 72, 74, 76, 78,
4168            80, 82, 84, 86, 88, 90, 92, 94,
4169        );
4170        assert_eq_m256i(r, e);
4171    }
4172
4173    #[simd_test(enable = "avx2")]
4174    fn test_mm256_adds_epi8_saturate_positive() {
4175        let a = _mm256_set1_epi8(0x7F);
4176        let b = _mm256_set1_epi8(1);
4177        let r = _mm256_adds_epi8(a, b);
4178        assert_eq_m256i(r, a);
4179    }
4180
4181    #[simd_test(enable = "avx2")]
4182    fn test_mm256_adds_epi8_saturate_negative() {
4183        let a = _mm256_set1_epi8(-0x80);
4184        let b = _mm256_set1_epi8(-1);
4185        let r = _mm256_adds_epi8(a, b);
4186        assert_eq_m256i(r, a);
4187    }
4188
4189    #[simd_test(enable = "avx2")]
4190    const fn test_mm256_adds_epi16() {
4191        #[rustfmt::skip]
4192        let a = _mm256_setr_epi16(
4193            0, 1, 2, 3, 4, 5, 6, 7,
4194            8, 9, 10, 11, 12, 13, 14, 15,
4195        );
4196        #[rustfmt::skip]
4197        let b = _mm256_setr_epi16(
4198            32, 33, 34, 35, 36, 37, 38, 39,
4199            40, 41, 42, 43, 44, 45, 46, 47,
4200        );
4201        let r = _mm256_adds_epi16(a, b);
4202        #[rustfmt::skip]
4203        let e = _mm256_setr_epi16(
4204            32, 34, 36, 38, 40, 42, 44, 46,
4205            48, 50, 52, 54, 56, 58, 60, 62,
4206        );
4207
4208        assert_eq_m256i(r, e);
4209    }
4210
4211    #[simd_test(enable = "avx2")]
4212    fn test_mm256_adds_epi16_saturate_positive() {
4213        let a = _mm256_set1_epi16(0x7FFF);
4214        let b = _mm256_set1_epi16(1);
4215        let r = _mm256_adds_epi16(a, b);
4216        assert_eq_m256i(r, a);
4217    }
4218
4219    #[simd_test(enable = "avx2")]
4220    fn test_mm256_adds_epi16_saturate_negative() {
4221        let a = _mm256_set1_epi16(-0x8000);
4222        let b = _mm256_set1_epi16(-1);
4223        let r = _mm256_adds_epi16(a, b);
4224        assert_eq_m256i(r, a);
4225    }
4226
4227    #[simd_test(enable = "avx2")]
4228    const fn test_mm256_adds_epu8() {
4229        #[rustfmt::skip]
4230        let a = _mm256_setr_epi8(
4231            0, 1, 2, 3, 4, 5, 6, 7,
4232            8, 9, 10, 11, 12, 13, 14, 15,
4233            16, 17, 18, 19, 20, 21, 22, 23,
4234            24, 25, 26, 27, 28, 29, 30, 31,
4235        );
4236        #[rustfmt::skip]
4237        let b = _mm256_setr_epi8(
4238            32, 33, 34, 35, 36, 37, 38, 39,
4239            40, 41, 42, 43, 44, 45, 46, 47,
4240            48, 49, 50, 51, 52, 53, 54, 55,
4241            56, 57, 58, 59, 60, 61, 62, 63,
4242        );
4243        let r = _mm256_adds_epu8(a, b);
4244        #[rustfmt::skip]
4245        let e = _mm256_setr_epi8(
4246            32, 34, 36, 38, 40, 42, 44, 46,
4247            48, 50, 52, 54, 56, 58, 60, 62,
4248            64, 66, 68, 70, 72, 74, 76, 78,
4249            80, 82, 84, 86, 88, 90, 92, 94,
4250        );
4251        assert_eq_m256i(r, e);
4252    }
4253
4254    #[simd_test(enable = "avx2")]
4255    fn test_mm256_adds_epu8_saturate() {
4256        let a = _mm256_set1_epi8(!0);
4257        let b = _mm256_set1_epi8(1);
4258        let r = _mm256_adds_epu8(a, b);
4259        assert_eq_m256i(r, a);
4260    }
4261
4262    #[simd_test(enable = "avx2")]
4263    const fn test_mm256_adds_epu16() {
4264        #[rustfmt::skip]
4265        let a = _mm256_setr_epi16(
4266            0, 1, 2, 3, 4, 5, 6, 7,
4267            8, 9, 10, 11, 12, 13, 14, 15,
4268        );
4269        #[rustfmt::skip]
4270        let b = _mm256_setr_epi16(
4271            32, 33, 34, 35, 36, 37, 38, 39,
4272            40, 41, 42, 43, 44, 45, 46, 47,
4273        );
4274        let r = _mm256_adds_epu16(a, b);
4275        #[rustfmt::skip]
4276        let e = _mm256_setr_epi16(
4277            32, 34, 36, 38, 40, 42, 44, 46,
4278            48, 50, 52, 54, 56, 58, 60, 62,
4279        );
4280
4281        assert_eq_m256i(r, e);
4282    }
4283
4284    #[simd_test(enable = "avx2")]
4285    fn test_mm256_adds_epu16_saturate() {
4286        let a = _mm256_set1_epi16(!0);
4287        let b = _mm256_set1_epi16(1);
4288        let r = _mm256_adds_epu16(a, b);
4289        assert_eq_m256i(r, a);
4290    }
4291
4292    #[simd_test(enable = "avx2")]
4293    const fn test_mm256_and_si256() {
4294        let a = _mm256_set1_epi8(5);
4295        let b = _mm256_set1_epi8(3);
4296        let got = _mm256_and_si256(a, b);
4297        assert_eq_m256i(got, _mm256_set1_epi8(1));
4298    }
4299
4300    #[simd_test(enable = "avx2")]
4301    const fn test_mm256_andnot_si256() {
4302        let a = _mm256_set1_epi8(5);
4303        let b = _mm256_set1_epi8(3);
4304        let got = _mm256_andnot_si256(a, b);
4305        assert_eq_m256i(got, _mm256_set1_epi8(2));
4306    }
4307
4308    #[simd_test(enable = "avx2")]
4309    const fn test_mm256_avg_epu8() {
4310        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4311        let r = _mm256_avg_epu8(a, b);
4312        assert_eq_m256i(r, _mm256_set1_epi8(6));
4313    }
4314
4315    #[simd_test(enable = "avx2")]
4316    const fn test_mm256_avg_epu16() {
4317        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4318        let r = _mm256_avg_epu16(a, b);
4319        assert_eq_m256i(r, _mm256_set1_epi16(6));
4320    }
4321
4322    #[simd_test(enable = "avx2")]
4323    const fn test_mm_blend_epi32() {
4324        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4325        let e = _mm_setr_epi32(9, 3, 3, 3);
4326        let r = _mm_blend_epi32::<0x01>(a, b);
4327        assert_eq_m128i(r, e);
4328
4329        let r = _mm_blend_epi32::<0x0E>(b, a);
4330        assert_eq_m128i(r, e);
4331    }
4332
4333    #[simd_test(enable = "avx2")]
4334    const fn test_mm256_blend_epi32() {
4335        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4336        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4337        let r = _mm256_blend_epi32::<0x01>(a, b);
4338        assert_eq_m256i(r, e);
4339
4340        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4341        let r = _mm256_blend_epi32::<0x82>(a, b);
4342        assert_eq_m256i(r, e);
4343
4344        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4345        let r = _mm256_blend_epi32::<0x7C>(a, b);
4346        assert_eq_m256i(r, e);
4347    }
4348
4349    #[simd_test(enable = "avx2")]
4350    const fn test_mm256_blend_epi16() {
4351        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4352        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4353        let r = _mm256_blend_epi16::<0x01>(a, b);
4354        assert_eq_m256i(r, e);
4355
4356        let r = _mm256_blend_epi16::<0xFE>(b, a);
4357        assert_eq_m256i(r, e);
4358    }
4359
4360    #[simd_test(enable = "avx2")]
4361    const fn test_mm256_blendv_epi8() {
4362        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4363        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
4364        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
4365        let r = _mm256_blendv_epi8(a, b, mask);
4366        assert_eq_m256i(r, e);
4367    }
4368
4369    #[simd_test(enable = "avx2")]
4370    const fn test_mm_broadcastb_epi8() {
4371        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4372        let res = _mm_broadcastb_epi8(a);
4373        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4374    }
4375
4376    #[simd_test(enable = "avx2")]
4377    const fn test_mm256_broadcastb_epi8() {
4378        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4379        let res = _mm256_broadcastb_epi8(a);
4380        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4381    }
4382
4383    #[simd_test(enable = "avx2")]
4384    const fn test_mm_broadcastd_epi32() {
4385        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4386        let res = _mm_broadcastd_epi32(a);
4387        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4388    }
4389
4390    #[simd_test(enable = "avx2")]
4391    const fn test_mm256_broadcastd_epi32() {
4392        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4393        let res = _mm256_broadcastd_epi32(a);
4394        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4395    }
4396
4397    #[simd_test(enable = "avx2")]
4398    const fn test_mm_broadcastq_epi64() {
4399        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4400        let res = _mm_broadcastq_epi64(a);
4401        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4402    }
4403
4404    #[simd_test(enable = "avx2")]
4405    const fn test_mm256_broadcastq_epi64() {
4406        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4407        let res = _mm256_broadcastq_epi64(a);
4408        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4409    }
4410
4411    #[simd_test(enable = "avx2")]
4412    const fn test_mm_broadcastsd_pd() {
4413        let a = _mm_setr_pd(6.88, 3.44);
4414        let res = _mm_broadcastsd_pd(a);
4415        assert_eq_m128d(res, _mm_set1_pd(6.88));
4416    }
4417
4418    #[simd_test(enable = "avx2")]
4419    const fn test_mm256_broadcastsd_pd() {
4420        let a = _mm_setr_pd(6.88, 3.44);
4421        let res = _mm256_broadcastsd_pd(a);
4422        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
4423    }
4424
4425    #[simd_test(enable = "avx2")]
4426    const fn test_mm_broadcastsi128_si256() {
4427        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4428        let res = _mm_broadcastsi128_si256(a);
4429        let retval = _mm256_setr_epi64x(
4430            0x0987654321012334,
4431            0x5678909876543210,
4432            0x0987654321012334,
4433            0x5678909876543210,
4434        );
4435        assert_eq_m256i(res, retval);
4436    }
4437
4438    #[simd_test(enable = "avx2")]
4439    const fn test_mm256_broadcastsi128_si256() {
4440        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4441        let res = _mm256_broadcastsi128_si256(a);
4442        let retval = _mm256_setr_epi64x(
4443            0x0987654321012334,
4444            0x5678909876543210,
4445            0x0987654321012334,
4446            0x5678909876543210,
4447        );
4448        assert_eq_m256i(res, retval);
4449    }
4450
4451    #[simd_test(enable = "avx2")]
4452    const fn test_mm_broadcastss_ps() {
4453        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4454        let res = _mm_broadcastss_ps(a);
4455        assert_eq_m128(res, _mm_set1_ps(6.88));
4456    }
4457
4458    #[simd_test(enable = "avx2")]
4459    const fn test_mm256_broadcastss_ps() {
4460        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4461        let res = _mm256_broadcastss_ps(a);
4462        assert_eq_m256(res, _mm256_set1_ps(6.88));
4463    }
4464
4465    #[simd_test(enable = "avx2")]
4466    const fn test_mm_broadcastw_epi16() {
4467        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4468        let res = _mm_broadcastw_epi16(a);
4469        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4470    }
4471
4472    #[simd_test(enable = "avx2")]
4473    const fn test_mm256_broadcastw_epi16() {
4474        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4475        let res = _mm256_broadcastw_epi16(a);
4476        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4477    }
4478
4479    #[simd_test(enable = "avx2")]
4480    const fn test_mm256_cmpeq_epi8() {
4481        #[rustfmt::skip]
4482        let a = _mm256_setr_epi8(
4483            0, 1, 2, 3, 4, 5, 6, 7,
4484            8, 9, 10, 11, 12, 13, 14, 15,
4485            16, 17, 18, 19, 20, 21, 22, 23,
4486            24, 25, 26, 27, 28, 29, 30, 31,
4487        );
4488        #[rustfmt::skip]
4489        let b = _mm256_setr_epi8(
4490            31, 30, 2, 28, 27, 26, 25, 24,
4491            23, 22, 21, 20, 19, 18, 17, 16,
4492            15, 14, 13, 12, 11, 10, 9, 8,
4493            7, 6, 5, 4, 3, 2, 1, 0,
4494        );
4495        let r = _mm256_cmpeq_epi8(a, b);
4496        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
4497    }
4498
    // Comparison tests: each lane of the result is all-ones (!0) where the
    // predicate holds and zero elsewhere. Inputs are chosen so that exactly
    // one lane matches, pinning both the true and false encodings.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpeq_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            15, 14, 2, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        let r = _mm256_cmpeq_epi16(a, b);
        // `a` ascending vs `b` descending coincide only at lane 2 (both 2).
        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpeq_epi32() {
        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
        let r = _mm256_cmpeq_epi32(a, b);
        let e = _mm256_set1_epi32(0);
        // Only lane 2 is equal.
        let e = _mm256_insert_epi32::<2>(e, !0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpeq_epi64() {
        let a = _mm256_setr_epi64x(0, 1, 2, 3);
        let b = _mm256_setr_epi64x(3, 2, 2, 0);
        let r = _mm256_cmpeq_epi64(a, b);
        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
    }

    // For the signed greater-than tests a single lane 0 holds 5 > 0; every
    // other lane is 0 > 0 (false), so only lane 0 of the mask is set.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpgt_epi8() {
        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
        let b = _mm256_set1_epi8(0);
        let r = _mm256_cmpgt_epi8(a, b);
        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpgt_epi16() {
        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
        let b = _mm256_set1_epi16(0);
        let r = _mm256_cmpgt_epi16(a, b);
        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpgt_epi32() {
        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
        let b = _mm256_set1_epi32(0);
        let r = _mm256_cmpgt_epi32(a, b);
        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpgt_epi64() {
        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_cmpgt_epi64(a, b);
        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
    }
4564
    // Widening conversion tests. `cvtepi*` sign-extends, `cvtepu*` zero-extends;
    // only the low elements of the 128-bit source that fit in the 256-bit
    // destination are converted (16, 8, or 4 elements depending on widths).
    // The signed inputs mix negative and positive values to check that the
    // sign bit is propagated; the unsigned inputs are plain ascending values.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepi8_epi16() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepi8_epi32() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        // Only the low 8 bytes are widened to 32 bits.
        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepi8_epi64() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 0, -1, 1, -2, 2, -3, 3,
            -4, 4, -5, 5, -6, 6, -7, 7,
        );
        // Only the low 4 bytes are widened to 64 bits.
        let r = _mm256_setr_epi64x(0, 0, -1, 1);
        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepi16_epi32() {
        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepi16_epi64() {
        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
        let r = _mm256_setr_epi64x(0, 0, -1, 1);
        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepi32_epi64() {
        let a = _mm_setr_epi32(0, 0, -1, 1);
        let r = _mm256_setr_epi64x(0, 0, -1, 1);
        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepu16_epi32() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepu16_epi64() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepu32_epi64() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepu8_epi16() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepu8_epi32() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepu8_epi64() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
    }
4680
    #[simd_test(enable = "avx2")]
    const fn test_mm256_extracti128_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        // Index 1 selects the upper 128-bit half.
        let r = _mm256_extracti128_si256::<1>(a);
        let e = _mm_setr_epi64x(3, 4);
        assert_eq_m128i(r, e);
    }

    // Horizontal add/sub tests. The 256-bit horizontal ops work per 128-bit
    // lane: each lane of the result holds pairwise sums (or differences) from
    // `a` in its low half and from `b` in its high half, hence the
    // 2+2=4 / 4+4=8 interleaving in the expected vectors.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_hadd_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hadd_epi16(a, b);
        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_hadd_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_hadd_epi32(a, b);
        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_hadds_epi16() {
        let a = _mm256_set1_epi16(2);
        // First pair of `a` is (0x7fff, 1): the saturating add must clamp the
        // sum to i16::MAX instead of wrapping.
        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
        let a = _mm256_insert_epi16::<1>(a, 1);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hadds_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
            4, 4, 4, 4, 8, 8, 8, 8,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_hsub_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hsub_epi16(a, b);
        // Equal neighbours subtract to zero everywhere.
        let e = _mm256_set1_epi16(0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_hsub_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_hsub_epi32(a, b);
        let e = _mm256_set1_epi32(0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_hsubs_epi16() {
        let a = _mm256_set1_epi16(2);
        // First pair of `a` is (0x7fff, -1): 0x7fff - (-1) saturates to i16::MAX.
        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
        let a = _mm256_insert_epi16::<1>(a, -1);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hsubs_epi16(a, b);
        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_madd_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_madd_epi16(a, b);
        // Each i32 lane is 2*4 + 2*4 = 16 (pairwise multiply-add).
        let e = _mm256_set1_epi32(16);
        assert_eq_m256i(r, e);
    }
4759
    // Codegen regression check (not a value test): `assert_instr` verifies that
    // this wrapper still compiles down to a `vpmaddwd` instruction.
    #[target_feature(enable = "avx2")]
    #[cfg_attr(test, assert_instr(vpmaddwd))]
    unsafe fn test_mm256_madd_epi16_mul_one(v: __m256i) -> __m256i {
        // This is a trick used in the adler32 algorithm to get a widening addition. The
        // multiplication by 1 is trivial, but must not be optimized out because then the vpmaddwd
        // instruction is no longer selected. The assert_instr verifies that this is the case.
        let one_v = _mm256_set1_epi16(1);
        _mm256_madd_epi16(v, one_v)
    }

    #[target_feature(enable = "avx2")]
    #[cfg_attr(test, assert_instr(vpmaddwd))]
    unsafe fn test_mm256_madd_epi16_shl(v: __m256i) -> __m256i {
        // This is a trick used in the base64 algorithm to get a widening addition. Instead of a
        // multiplication, a vector shl is used. In LLVM 22 that breaks the pattern recognition
        // for the automatic optimization to vpmaddwd.
        // NOTE(review): the operand here is a plain constant 12 (pairs (12, 0) per
        // i32 lane), not an obviously shift-derived pattern — confirm this still
        // exercises the shl-based lowering described above.
        let shift_value = _mm256_set1_epi32(12i32);
        _mm256_madd_epi16(v, shift_value)
    }
4779
    #[simd_test(enable = "avx2")]
    const fn test_mm256_inserti128_si256() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let b = _mm_setr_epi64x(7, 8);
        // Index 1 replaces the upper 128-bit half; the lower half is kept.
        let r = _mm256_inserti128_si256::<1>(a, b);
        let e = _mm256_setr_epi64x(1, 2, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_maddubs_epi16() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_maddubs_epi16(a, b);
        // Unsigned `a` times signed `b`, summed pairwise: 2*4 + 2*4 = 16.
        let e = _mm256_set1_epi16(16);
        assert_eq_m256i(r, e);
    }
4797
    // Masked load tests: lanes whose mask element has the sign bit set (-1)
    // are read from memory; masked-out lanes are zeroed.
    #[simd_test(enable = "avx2")]
    const fn test_mm_maskload_epi32() {
        let nums = [1, 2, 3, 4];
        let a = &nums as *const i32;
        let mask = _mm_setr_epi32(-1, 0, 0, -1);
        let r = unsafe { _mm_maskload_epi32(a, mask) };
        let e = _mm_setr_epi32(1, 0, 0, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_maskload_epi32() {
        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
        let a = &nums as *const i32;
        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
        let r = unsafe { _mm256_maskload_epi32(a, mask) };
        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_maskload_epi64() {
        let nums = [1_i64, 2_i64];
        let a = &nums as *const i64;
        let mask = _mm_setr_epi64x(0, -1);
        let r = unsafe { _mm_maskload_epi64(a, mask) };
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_maskload_epi64() {
        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
        let a = &nums as *const i64;
        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
        let r = unsafe { _mm256_maskload_epi64(a, mask) };
        let e = _mm256_setr_epi64x(0, 2, 3, 0);
        assert_eq_m256i(r, e);
    }

    // Masked store tests: only lanes with the mask sign bit set are written;
    // the destination's other elements must be left untouched.
    #[simd_test(enable = "avx2")]
    const fn test_mm_maskstore_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut arr = [-1, -1, -1, -1];
        let mask = _mm_setr_epi32(-1, 0, 0, -1);
        unsafe {
            _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
        }
        let e = [1, -1, -1, 4];
        assert_eq!(arr, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_maskstore_epi32() {
        // The odd-looking constants are arbitrary sentinels; what matters is
        // that masked-out source lanes never reach `arr` and masked-out
        // destination elements keep their original values.
        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
        unsafe {
            _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
        }
        let e = [1, -1, -1, 42, -1, 6, 7, -1];
        assert_eq!(arr, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_maskstore_epi64() {
        let a = _mm_setr_epi64x(1_i64, 2_i64);
        let mut arr = [-1_i64, -1_i64];
        let mask = _mm_setr_epi64x(0, -1);
        unsafe {
            _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
        }
        let e = [-1, 2];
        assert_eq!(arr, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_maskstore_epi64() {
        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
        unsafe {
            _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
        }
        let e = [-1, 2, 3, -1];
        assert_eq!(arr, e);
    }
4885
    // Element-wise max/min tests over every supported width and signedness:
    // with uniform inputs 2 and 4, max returns the `b` vector and min the `a`
    // vector in every lane.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_max_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_max_epi16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_max_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_max_epi32(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_max_epi8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_max_epi8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_max_epu16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_max_epu16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_max_epu32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_max_epu32(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_max_epu8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_max_epu8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_min_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_min_epi16(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_min_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_min_epi32(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_min_epi8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_min_epi8(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_min_epu16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_min_epu16(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_min_epu32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_min_epu32(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_min_epu8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_min_epu8(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_movemask_epi8() {
        // All 32 byte lanes have their sign bit set, so all 32 mask bits are
        // set, i.e. the i32 result is -1.
        let a = _mm256_set1_epi8(-1);
        let r = _mm256_movemask_epi8(a);
        let e = -1;
        assert_eq!(r, e);
    }
4989
    #[simd_test(enable = "avx2")]
    fn test_mm256_mpsadbw_epu8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_mpsadbw_epu8::<0>(a, b);
        // Each result word is the SAD of four byte pairs: 4 * |2 - 4| = 8.
        let e = _mm256_set1_epi16(8);
        assert_eq_m256i(r, e);
    }

    // `mul_epi32`/`mul_epu32` multiply only the even-indexed 32-bit elements
    // (0, 2, 4, 6), producing four 64-bit products.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_mul_epi32() {
        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_mul_epi32(a, b);
        // Products of even lanes: 0*1, 0*3, 2*5, 2*7.
        let e = _mm256_setr_epi64x(0, 0, 10, 14);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_mul_epu32() {
        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_mul_epu32(a, b);
        let e = _mm256_setr_epi64x(0, 0, 10, 14);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_mulhi_epi16() {
        let a = _mm256_set1_epi16(6535);
        let b = _mm256_set1_epi16(6535);
        let r = _mm256_mulhi_epi16(a, b);
        // 6535 * 6535 = 42706225; the high 16 bits are 42706225 >> 16 = 651.
        let e = _mm256_set1_epi16(651);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_mulhi_epu16() {
        let a = _mm256_set1_epi16(6535);
        let b = _mm256_set1_epi16(6535);
        let r = _mm256_mulhi_epu16(a, b);
        let e = _mm256_set1_epi16(651);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_mullo_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_mullo_epi16(a, b);
        let e = _mm256_set1_epi16(8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_mullo_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_mullo_epi32(a, b);
        let e = _mm256_set1_epi32(8);
        assert_eq_m256i(r, e);
    }
5052
5053    #[simd_test(enable = "avx2")]
5054    fn test_mm256_mulhrs_epi16() {
5055        let a = _mm256_set1_epi16(2);
5056        let b = _mm256_set1_epi16(4);
5057        let r = _mm256_mullo_epi16(a, b);
5058        let e = _mm256_set1_epi16(8);
5059        assert_eq_m256i(r, e);
5060    }
5061
    #[simd_test(enable = "avx2")]
    const fn test_mm256_or_si256() {
        let a = _mm256_set1_epi8(-1);
        let b = _mm256_set1_epi8(0);
        let r = _mm256_or_si256(a, b);
        // x | 0 == x, so the all-ones vector is unchanged.
        assert_eq_m256i(r, a);
    }

    // Pack tests: packing narrows each source to half width and interleaves
    // per 128-bit lane — 8 (or 4) elements of `a` then 8 (or 4) of `b`,
    // repeated for the upper lane, hence the 2...4...2...4 layout.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_packs_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_packs_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_packs_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_packs_epi32(a, b);
        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_packus_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_packus_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_packus_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(4);
        let r = _mm256_packus_epi32(a, b);
        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_sad_epu8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_sad_epu8(a, b);
        // Each 64-bit lane sums |2 - 4| over 8 bytes: 8 * 2 = 16.
        let e = _mm256_set1_epi64x(16);
        assert_eq_m256i(r, e);
    }
5130
    // Shuffle tests: the immediate 0b00_01_01_11 selects source positions
    // (3, 1, 1, 0) for the four shuffled words of each 128-bit lane, while the
    // other four words (0..=3 sequences here) pass through unchanged.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_shufflehi_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 11, 22, 33, 44,
            4, 5, 6, 7, 55, 66, 77, 88,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 2, 3, 44, 22, 22, 11,
            4, 5, 6, 7, 88, 66, 66, 55,
        );
        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_shufflelo_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            11, 22, 33, 44, 0, 1, 2, 3,
            55, 66, 77, 88, 4, 5, 6, 7,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            44, 22, 22, 11, 0, 1, 2, 3,
            88, 66, 66, 55, 4, 5, 6, 7,
        );
        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
        assert_eq_m256i(r, e);
    }

    // Sign tests: a negative element in `b` negates the corresponding element
    // of `a`, so 2 with sign -1 becomes -2 in every lane.
    #[simd_test(enable = "avx2")]
    fn test_mm256_sign_epi16() {
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(-1);
        let r = _mm256_sign_epi16(a, b);
        let e = _mm256_set1_epi16(-2);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_sign_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(-1);
        let r = _mm256_sign_epi32(a, b);
        let e = _mm256_set1_epi32(-2);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_sign_epi8() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(-1);
        let r = _mm256_sign_epi8(a, b);
        let e = _mm256_set1_epi8(-2);
        assert_eq_m256i(r, e);
    }
5189
    // Left-shift tests. `sll_*` takes the count from the low 64 bits of a
    // 128-bit vector, `slli_*` from a const immediate, and `sllv_*` shifts
    // each lane by its own per-lane count.
    #[simd_test(enable = "avx2")]
    fn test_mm256_sll_epi16() {
        let a = _mm256_set1_epi16(0xFF);
        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
        let r = _mm256_sll_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_sll_epi32() {
        let a = _mm256_set1_epi32(0xFFFF);
        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
        let r = _mm256_sll_epi32(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_sll_epi64() {
        let a = _mm256_set1_epi64x(0xFFFFFFFF);
        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
        let r = _mm256_sll_epi64(a, b);
        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_slli_epi16() {
        assert_eq_m256i(
            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
            _mm256_set1_epi16(0xFF0),
        );
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_slli_epi32() {
        assert_eq_m256i(
            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
            _mm256_set1_epi32(0xFFFF0),
        );
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_slli_epi64() {
        assert_eq_m256i(
            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
            _mm256_set1_epi64x(0xFFFFFFFF0),
        );
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_slli_si256() {
        // Byte-wise shift within each 128-bit lane: 3 bytes = 24 bits.
        let a = _mm256_set1_epi64x(0xFFFFFFFF);
        let r = _mm256_slli_si256::<3>(a);
        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_sllv_epi32() {
        let a = _mm_set1_epi32(2);
        let b = _mm_set1_epi32(1);
        let r = _mm_sllv_epi32(a, b);
        let e = _mm_set1_epi32(4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_sllv_epi32() {
        let a = _mm256_set1_epi32(2);
        let b = _mm256_set1_epi32(1);
        let r = _mm256_sllv_epi32(a, b);
        let e = _mm256_set1_epi32(4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_sllv_epi64() {
        let a = _mm_set1_epi64x(2);
        let b = _mm_set1_epi64x(1);
        let r = _mm_sllv_epi64(a, b);
        let e = _mm_set1_epi64x(4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_sllv_epi64() {
        let a = _mm256_set1_epi64x(2);
        let b = _mm256_set1_epi64x(1);
        let r = _mm256_sllv_epi64(a, b);
        let e = _mm256_set1_epi64x(4);
        assert_eq_m256i(r, e);
    }
5280
    // Arithmetic right-shift tests: -1 shifted right arithmetically stays -1
    // because the sign bit is replicated.
    #[simd_test(enable = "avx2")]
    fn test_mm256_sra_epi16() {
        let a = _mm256_set1_epi16(-1);
        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm256_sra_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(-1));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_sra_epi32() {
        let a = _mm256_set1_epi32(-1);
        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
        let r = _mm256_sra_epi32(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(-1));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_srai_epi16() {
        assert_eq_m256i(
            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
            _mm256_set1_epi16(-1),
        );
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_srai_epi32() {
        assert_eq_m256i(
            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
            _mm256_set1_epi32(-1),
        );
    }

    // Variable arithmetic shifts: each lane shifted by its own count.
    #[simd_test(enable = "avx2")]
    const fn test_mm_srav_epi32() {
        let a = _mm_set1_epi32(4);
        let count = _mm_set1_epi32(1);
        let r = _mm_srav_epi32(a, count);
        let e = _mm_set1_epi32(2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_srav_epi32() {
        let a = _mm256_set1_epi32(4);
        let count = _mm256_set1_epi32(1);
        let r = _mm256_srav_epi32(a, count);
        let e = _mm256_set1_epi32(2);
        assert_eq_m256i(r, e);
    }
5330
    #[simd_test(enable = "avx2")]
    const fn test_mm256_srli_si256() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        // Byte-wise shift happens independently within each 128-bit lane,
        // so bytes 14..16 and 30..32 are replaced by zeros.
        let r = _mm256_srli_si256::<3>(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            4, 5, 6, 7, 8, 9, 10, 11,
            12, 13, 14, 15, 16, 0, 0, 0,
            20, 21, 22, 23, 24, 25, 26, 27,
            28, 29, 30, 31, 32, 0, 0, 0,
        );
        assert_eq_m256i(r, e);
    }

    // Logical right-shift tests: `srl_*` takes the count from the low 64 bits
    // of a vector, `srli_*` from a const immediate, `srlv_*` per lane.
    #[simd_test(enable = "avx2")]
    fn test_mm256_srl_epi16() {
        let a = _mm256_set1_epi16(0xFF);
        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
        let r = _mm256_srl_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_srl_epi32() {
        let a = _mm256_set1_epi32(0xFFFF);
        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
        let r = _mm256_srl_epi32(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_srl_epi64() {
        let a = _mm256_set1_epi64x(0xFFFFFFFF);
        let b = _mm_setr_epi64x(4, 0);
        let r = _mm256_srl_epi64(a, b);
        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_srli_epi16() {
        assert_eq_m256i(
            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
            _mm256_set1_epi16(0xF),
        );
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_srli_epi32() {
        assert_eq_m256i(
            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
            _mm256_set1_epi32(0xFFF),
        );
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_srli_epi64() {
        assert_eq_m256i(
            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
            _mm256_set1_epi64x(0xFFFFFFF),
        );
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_srlv_epi32() {
        let a = _mm_set1_epi32(2);
        let count = _mm_set1_epi32(1);
        let r = _mm_srlv_epi32(a, count);
        let e = _mm_set1_epi32(1);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_srlv_epi32() {
        let a = _mm256_set1_epi32(2);
        let count = _mm256_set1_epi32(1);
        let r = _mm256_srlv_epi32(a, count);
        let e = _mm256_set1_epi32(1);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_srlv_epi64() {
        let a = _mm_set1_epi64x(2);
        let count = _mm_set1_epi64x(1);
        let r = _mm_srlv_epi64(a, count);
        let e = _mm_set1_epi64x(1);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_srlv_epi64() {
        let a = _mm256_set1_epi64x(2);
        let count = _mm256_set1_epi64x(1);
        let r = _mm256_srlv_epi64(a, count);
        let e = _mm256_set1_epi64x(1);
        assert_eq_m256i(r, e);
    }
5434
    #[simd_test(enable = "avx2")]
    fn test_mm256_stream_load_si256() {
        // Non-temporal load must read back exactly what was stored.
        let a = _mm256_set_epi64x(5, 6, 7, 8);
        let r = unsafe { _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _) };
        assert_eq_m256i(a, r);
    }

    // Subtraction tests: with uniform inputs 4 and 2, 4 - 2 == 2 in every
    // lane, so the result equals the `b` vector. The saturating variants
    // behave identically here because no lane over- or underflows.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_sub_epi16() {
        let a = _mm256_set1_epi16(4);
        let b = _mm256_set1_epi16(2);
        let r = _mm256_sub_epi16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_sub_epi32() {
        let a = _mm256_set1_epi32(4);
        let b = _mm256_set1_epi32(2);
        let r = _mm256_sub_epi32(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_sub_epi64() {
        let a = _mm256_set1_epi64x(4);
        let b = _mm256_set1_epi64x(2);
        let r = _mm256_sub_epi64(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_sub_epi8() {
        let a = _mm256_set1_epi8(4);
        let b = _mm256_set1_epi8(2);
        let r = _mm256_sub_epi8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_subs_epi16() {
        let a = _mm256_set1_epi16(4);
        let b = _mm256_set1_epi16(2);
        let r = _mm256_subs_epi16(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_subs_epi8() {
        let a = _mm256_set1_epi8(4);
        let b = _mm256_set1_epi8(2);
        let r = _mm256_subs_epi8(a, b);
        assert_eq_m256i(r, b);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_subs_epu16() {
        let a = _mm256_set1_epi16(4);
        let b = _mm256_set1_epi16(2);
        let r = _mm256_subs_epu16(a, b);
        assert_eq_m256i(r, b);
    }
5497
5498    #[simd_test(enable = "avx2")]
5499    const fn test_mm256_subs_epu8() {
5500        let a = _mm256_set1_epi8(4);
5501        let b = _mm256_set1_epi8(2);
5502        let r = _mm256_subs_epu8(a, b);
5503        assert_eq_m256i(r, b);
5504    }
5505
5506    #[simd_test(enable = "avx2")]
5507    const fn test_mm256_xor_si256() {
5508        let a = _mm256_set1_epi8(5);
5509        let b = _mm256_set1_epi8(3);
5510        let r = _mm256_xor_si256(a, b);
5511        assert_eq_m256i(r, _mm256_set1_epi8(6));
5512    }
5513
    #[simd_test(enable = "avx2")]
    const fn test_mm256_alignr_epi8() {
        // `_mm256_alignr_epi8::<IMM>` concatenates corresponding 128-bit
        // halves of `a` (high) and `b` (low) and shifts the 32-byte pair
        // right by IMM bytes, independently in each 128-bit lane.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            -1, -2, -3, -4, -5, -6, -7, -8,
            -9, -10, -11, -12, -13, -14, -15, -16,
            -17, -18, -19, -20, -21, -22, -23, -24,
            -25, -26, -27, -28, -29, -30, -31, -32,
        );
        // A shift of 32 or more bytes drains both sources: all zeros.
        let r = _mm256_alignr_epi8::<33>(a, b);
        assert_eq_m256i(r, _mm256_set1_epi8(0));

        // Shift of 17..=31 draws only from `a`, zero-filling from the top.
        let r = _mm256_alignr_epi8::<17>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            2, 3, 4, 5, 6, 7, 8, 9,
            10, 11, 12, 13, 14, 15, 16, 0,
            18, 19, 20, 21, 22, 23, 24, 25,
            26, 27, 28, 29, 30, 31, 32, 0,
        );
        assert_eq_m256i(r, expected);

        // Shift of 4: the low 12 bytes of each result lane come from `b`,
        // the top 4 from `a`.
        let r = _mm256_alignr_epi8::<4>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            -5, -6, -7, -8, -9, -10, -11, -12,
            -13, -14, -15, -16, 1, 2, 3, 4,
            -21, -22, -23, -24, -25, -26, -27, -28,
            -29, -30, -31, -32, 17, 18, 19, 20,
        );
        assert_eq_m256i(r, expected);

        // Shift of 15: only `b`'s last byte per lane survives at position 0.
        let r = _mm256_alignr_epi8::<15>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            -16, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            -32, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        assert_eq_m256i(r, expected);

        // Degenerate cases: shift 0 returns `b` unchanged, shift 16 returns `a`.
        let r = _mm256_alignr_epi8::<0>(a, b);
        assert_eq_m256i(r, b);

        let r = _mm256_alignr_epi8::<16>(a, b);
        assert_eq_m256i(r, a);
    }
5569
    #[simd_test(enable = "avx2")]
    fn test_mm256_shuffle_epi8() {
        // Byte shuffle. Each output byte is selected from `a` within the
        // same 128-bit lane (note the expected values 21.. in the upper
        // half select from a's upper 16 bytes).
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        // Control bytes: the low 4 bits index into the lane; a control byte
        // with the top bit set (here `128u8 as i8`) forces the output byte
        // to 0, as seen in the expected result below.
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
            12, 5, 5, 10, 4, 1, 8, 0,
            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
            12, 5, 5, 10, 4, 1, 8, 0,
        );
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            5, 0, 5, 4, 9, 13, 7, 4,
            13, 6, 6, 11, 5, 2, 9, 1,
            21, 0, 21, 20, 25, 29, 23, 20,
            29, 22, 22, 27, 21, 18, 25, 17,
        );
        let r = _mm256_shuffle_epi8(a, b);
        assert_eq_m256i(r, expected);
    }
5596
5597    #[simd_test(enable = "avx2")]
5598    fn test_mm256_permutevar8x32_epi32() {
5599        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5600        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5601        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5602        let r = _mm256_permutevar8x32_epi32(a, b);
5603        assert_eq_m256i(r, expected);
5604    }
5605
5606    #[simd_test(enable = "avx2")]
5607    const fn test_mm256_permute4x64_epi64() {
5608        let a = _mm256_setr_epi64x(100, 200, 300, 400);
5609        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5610        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5611        assert_eq_m256i(r, expected);
5612    }
5613
5614    #[simd_test(enable = "avx2")]
5615    const fn test_mm256_permute2x128_si256() {
5616        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5617        let b = _mm256_setr_epi64x(300, 400, 700, 800);
5618        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5619        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5620        assert_eq_m256i(r, e);
5621    }
5622
5623    #[simd_test(enable = "avx2")]
5624    const fn test_mm256_permute4x64_pd() {
5625        let a = _mm256_setr_pd(1., 2., 3., 4.);
5626        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5627        let e = _mm256_setr_pd(4., 1., 2., 1.);
5628        assert_eq_m256d(r, e);
5629    }
5630
5631    #[simd_test(enable = "avx2")]
5632    fn test_mm256_permutevar8x32_ps() {
5633        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5634        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5635        let r = _mm256_permutevar8x32_ps(a, b);
5636        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5637        assert_eq_m256(r, e);
5638    }
5639
5640    #[simd_test(enable = "avx2")]
5641    fn test_mm_i32gather_epi32() {
5642        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5643        // A multiplier of 4 is word-addressing
5644        let r = unsafe { _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
5645        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5646    }
5647
5648    #[simd_test(enable = "avx2")]
5649    fn test_mm_mask_i32gather_epi32() {
5650        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5651        // A multiplier of 4 is word-addressing
5652        let r = unsafe {
5653            _mm_mask_i32gather_epi32::<4>(
5654                _mm_set1_epi32(256),
5655                arr.as_ptr(),
5656                _mm_setr_epi32(0, 16, 64, 96),
5657                _mm_setr_epi32(-1, -1, -1, 0),
5658            )
5659        };
5660        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5661    }
5662
5663    #[simd_test(enable = "avx2")]
5664    fn test_mm256_i32gather_epi32() {
5665        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5666        // A multiplier of 4 is word-addressing
5667        let r = unsafe {
5668            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4))
5669        };
5670        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5671    }
5672
5673    #[simd_test(enable = "avx2")]
5674    fn test_mm256_mask_i32gather_epi32() {
5675        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5676        // A multiplier of 4 is word-addressing
5677        let r = unsafe {
5678            _mm256_mask_i32gather_epi32::<4>(
5679                _mm256_set1_epi32(256),
5680                arr.as_ptr(),
5681                _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5682                _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5683            )
5684        };
5685        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5686    }
5687
5688    #[simd_test(enable = "avx2")]
5689    fn test_mm_i32gather_ps() {
5690        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5691        // A multiplier of 4 is word-addressing for f32s
5692        let r = unsafe { _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
5693        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5694    }
5695
5696    #[simd_test(enable = "avx2")]
5697    fn test_mm_mask_i32gather_ps() {
5698        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5699        // A multiplier of 4 is word-addressing for f32s
5700        let r = unsafe {
5701            _mm_mask_i32gather_ps::<4>(
5702                _mm_set1_ps(256.0),
5703                arr.as_ptr(),
5704                _mm_setr_epi32(0, 16, 64, 96),
5705                _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5706            )
5707        };
5708        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5709    }
5710
5711    #[simd_test(enable = "avx2")]
5712    fn test_mm256_i32gather_ps() {
5713        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5714        // A multiplier of 4 is word-addressing for f32s
5715        let r = unsafe {
5716            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4))
5717        };
5718        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5719    }
5720
5721    #[simd_test(enable = "avx2")]
5722    fn test_mm256_mask_i32gather_ps() {
5723        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5724        // A multiplier of 4 is word-addressing for f32s
5725        let r = unsafe {
5726            _mm256_mask_i32gather_ps::<4>(
5727                _mm256_set1_ps(256.0),
5728                arr.as_ptr(),
5729                _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5730                _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5731            )
5732        };
5733        assert_eq_m256(
5734            r,
5735            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5736        );
5737    }
5738
5739    #[simd_test(enable = "avx2")]
5740    fn test_mm_i32gather_epi64() {
5741        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5742        // A multiplier of 8 is word-addressing for i64s
5743        let r = unsafe { _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)) };
5744        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5745    }
5746
5747    #[simd_test(enable = "avx2")]
5748    fn test_mm_mask_i32gather_epi64() {
5749        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5750        // A multiplier of 8 is word-addressing for i64s
5751        let r = unsafe {
5752            _mm_mask_i32gather_epi64::<8>(
5753                _mm_set1_epi64x(256),
5754                arr.as_ptr(),
5755                _mm_setr_epi32(16, 16, 16, 16),
5756                _mm_setr_epi64x(-1, 0),
5757            )
5758        };
5759        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5760    }
5761
5762    #[simd_test(enable = "avx2")]
5763    fn test_mm256_i32gather_epi64() {
5764        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5765        // A multiplier of 8 is word-addressing for i64s
5766        let r = unsafe { _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
5767        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5768    }
5769
5770    #[simd_test(enable = "avx2")]
5771    fn test_mm256_mask_i32gather_epi64() {
5772        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5773        // A multiplier of 8 is word-addressing for i64s
5774        let r = unsafe {
5775            _mm256_mask_i32gather_epi64::<8>(
5776                _mm256_set1_epi64x(256),
5777                arr.as_ptr(),
5778                _mm_setr_epi32(0, 16, 64, 96),
5779                _mm256_setr_epi64x(-1, -1, -1, 0),
5780            )
5781        };
5782        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5783    }
5784
5785    #[simd_test(enable = "avx2")]
5786    fn test_mm_i32gather_pd() {
5787        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5788        // A multiplier of 8 is word-addressing for f64s
5789        let r = unsafe { _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)) };
5790        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5791    }
5792
5793    #[simd_test(enable = "avx2")]
5794    fn test_mm_mask_i32gather_pd() {
5795        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5796        // A multiplier of 8 is word-addressing for f64s
5797        let r = unsafe {
5798            _mm_mask_i32gather_pd::<8>(
5799                _mm_set1_pd(256.0),
5800                arr.as_ptr(),
5801                _mm_setr_epi32(16, 16, 16, 16),
5802                _mm_setr_pd(-1.0, 0.0),
5803            )
5804        };
5805        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5806    }
5807
5808    #[simd_test(enable = "avx2")]
5809    fn test_mm256_i32gather_pd() {
5810        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5811        // A multiplier of 8 is word-addressing for f64s
5812        let r = unsafe { _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
5813        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5814    }
5815
5816    #[simd_test(enable = "avx2")]
5817    fn test_mm256_mask_i32gather_pd() {
5818        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5819        // A multiplier of 8 is word-addressing for f64s
5820        let r = unsafe {
5821            _mm256_mask_i32gather_pd::<8>(
5822                _mm256_set1_pd(256.0),
5823                arr.as_ptr(),
5824                _mm_setr_epi32(0, 16, 64, 96),
5825                _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5826            )
5827        };
5828        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5829    }
5830
5831    #[simd_test(enable = "avx2")]
5832    fn test_mm_i64gather_epi32() {
5833        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5834        // A multiplier of 4 is word-addressing
5835        let r = unsafe { _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
5836        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5837    }
5838
5839    #[simd_test(enable = "avx2")]
5840    fn test_mm_mask_i64gather_epi32() {
5841        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5842        // A multiplier of 4 is word-addressing
5843        let r = unsafe {
5844            _mm_mask_i64gather_epi32::<4>(
5845                _mm_set1_epi32(256),
5846                arr.as_ptr(),
5847                _mm_setr_epi64x(0, 16),
5848                _mm_setr_epi32(-1, 0, -1, 0),
5849            )
5850        };
5851        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5852    }
5853
5854    #[simd_test(enable = "avx2")]
5855    fn test_mm256_i64gather_epi32() {
5856        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5857        // A multiplier of 4 is word-addressing
5858        let r =
5859            unsafe { _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
5860        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5861    }
5862
5863    #[simd_test(enable = "avx2")]
5864    fn test_mm256_mask_i64gather_epi32() {
5865        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5866        // A multiplier of 4 is word-addressing
5867        let r = unsafe {
5868            _mm256_mask_i64gather_epi32::<4>(
5869                _mm_set1_epi32(256),
5870                arr.as_ptr(),
5871                _mm256_setr_epi64x(0, 16, 64, 96),
5872                _mm_setr_epi32(-1, -1, -1, 0),
5873            )
5874        };
5875        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5876    }
5877
5878    #[simd_test(enable = "avx2")]
5879    fn test_mm_i64gather_ps() {
5880        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5881        // A multiplier of 4 is word-addressing for f32s
5882        let r = unsafe { _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
5883        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5884    }
5885
5886    #[simd_test(enable = "avx2")]
5887    fn test_mm_mask_i64gather_ps() {
5888        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5889        // A multiplier of 4 is word-addressing for f32s
5890        let r = unsafe {
5891            _mm_mask_i64gather_ps::<4>(
5892                _mm_set1_ps(256.0),
5893                arr.as_ptr(),
5894                _mm_setr_epi64x(0, 16),
5895                _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5896            )
5897        };
5898        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5899    }
5900
5901    #[simd_test(enable = "avx2")]
5902    fn test_mm256_i64gather_ps() {
5903        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5904        // A multiplier of 4 is word-addressing for f32s
5905        let r =
5906            unsafe { _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
5907        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5908    }
5909
5910    #[simd_test(enable = "avx2")]
5911    fn test_mm256_mask_i64gather_ps() {
5912        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5913        // A multiplier of 4 is word-addressing for f32s
5914        let r = unsafe {
5915            _mm256_mask_i64gather_ps::<4>(
5916                _mm_set1_ps(256.0),
5917                arr.as_ptr(),
5918                _mm256_setr_epi64x(0, 16, 64, 96),
5919                _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5920            )
5921        };
5922        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5923    }
5924
5925    #[simd_test(enable = "avx2")]
5926    fn test_mm_i64gather_epi64() {
5927        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5928        // A multiplier of 8 is word-addressing for i64s
5929        let r = unsafe { _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
5930        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5931    }
5932
5933    #[simd_test(enable = "avx2")]
5934    fn test_mm_mask_i64gather_epi64() {
5935        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5936        // A multiplier of 8 is word-addressing for i64s
5937        let r = unsafe {
5938            _mm_mask_i64gather_epi64::<8>(
5939                _mm_set1_epi64x(256),
5940                arr.as_ptr(),
5941                _mm_setr_epi64x(16, 16),
5942                _mm_setr_epi64x(-1, 0),
5943            )
5944        };
5945        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5946    }
5947
5948    #[simd_test(enable = "avx2")]
5949    fn test_mm256_i64gather_epi64() {
5950        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5951        // A multiplier of 8 is word-addressing for i64s
5952        let r =
5953            unsafe { _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
5954        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5955    }
5956
5957    #[simd_test(enable = "avx2")]
5958    fn test_mm256_mask_i64gather_epi64() {
5959        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5960        // A multiplier of 8 is word-addressing for i64s
5961        let r = unsafe {
5962            _mm256_mask_i64gather_epi64::<8>(
5963                _mm256_set1_epi64x(256),
5964                arr.as_ptr(),
5965                _mm256_setr_epi64x(0, 16, 64, 96),
5966                _mm256_setr_epi64x(-1, -1, -1, 0),
5967            )
5968        };
5969        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5970    }
5971
5972    #[simd_test(enable = "avx2")]
5973    fn test_mm_i64gather_pd() {
5974        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5975        // A multiplier of 8 is word-addressing for f64s
5976        let r = unsafe { _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
5977        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5978    }
5979
5980    #[simd_test(enable = "avx2")]
5981    fn test_mm_mask_i64gather_pd() {
5982        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5983        // A multiplier of 8 is word-addressing for f64s
5984        let r = unsafe {
5985            _mm_mask_i64gather_pd::<8>(
5986                _mm_set1_pd(256.0),
5987                arr.as_ptr(),
5988                _mm_setr_epi64x(16, 16),
5989                _mm_setr_pd(-1.0, 0.0),
5990            )
5991        };
5992        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5993    }
5994
5995    #[simd_test(enable = "avx2")]
5996    fn test_mm256_i64gather_pd() {
5997        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5998        // A multiplier of 8 is word-addressing for f64s
5999        let r =
6000            unsafe { _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
6001        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
6002    }
6003
6004    #[simd_test(enable = "avx2")]
6005    fn test_mm256_mask_i64gather_pd() {
6006        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
6007        // A multiplier of 8 is word-addressing for f64s
6008        let r = unsafe {
6009            _mm256_mask_i64gather_pd::<8>(
6010                _mm256_set1_pd(256.0),
6011                arr.as_ptr(),
6012                _mm256_setr_epi64x(0, 16, 64, 96),
6013                _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
6014            )
6015        };
6016        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
6017    }
6018
    #[simd_test(enable = "avx2")]
    const fn test_mm256_extract_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31
        );
        // The extracted byte is zero-extended to i32: lane 0 holds -1 but
        // the result is 0xFF, not -1.
        let r1 = _mm256_extract_epi8::<0>(a);
        let r2 = _mm256_extract_epi8::<3>(a);
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }
6033
    #[simd_test(enable = "avx2")]
    const fn test_mm256_extract_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        // The extracted word is zero-extended to i32: lane 0 holds -1 but
        // the result is 0xFFFF, not -1.
        let r1 = _mm256_extract_epi16::<0>(a);
        let r2 = _mm256_extract_epi16::<3>(a);
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }
6046}