Skip to main content

core/stdarch/crates/core_arch/src/x86/
sse41.rs

1//! Streaming SIMD Extensions 4.1 (SSE4.1)
2
3use crate::core_arch::{simd::*, x86::*};
4use crate::intrinsics::simd::*;
5
6#[cfg(test)]
7use stdarch_test::assert_instr;
8
// SSE4 rounding constants: values for the `ROUNDING` immediate of the
// `_mm_round_*` family. Bits [1:0] select the rounding mode (bit 2 set
// means "use MXCSR.RC instead"), bit 3 suppresses precision exceptions.
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
/// (`_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT`)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
50
51/// Blend packed 8-bit integers from `a` and `b` using `mask`
52///
53/// The high bit of each corresponding mask byte determines the selection.
54/// If the high bit is set, the element of `b` is selected.
55/// Otherwise, the element of `a` is selected.
56///
57/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8)
58#[inline]
59#[target_feature(enable = "sse4.1")]
60#[cfg_attr(test, assert_instr(pblendvb))]
61#[stable(feature = "simd_x86", since = "1.27.0")]
62#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
63pub const fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
64    unsafe {
65        let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO);
66        transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
67    }
68}
69
/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // `simd_shuffle!` indexes the concatenation of both inputs: indices
        // 0..=7 name lanes of `a`, 8..=15 name lanes of `b`. Each entry below
        // is a two-element table indexed by one bit of `IMM8`, so bit `n`
        // clear picks lane `n` of `a` and bit `n` set picks lane `n` of `b`.
        transmute::<i16x8, _>(simd_shuffle!(
            a.as_i16x8(),
            b.as_i16x8(),
            [
                [0, 8][IMM8 as usize & 1],
                [1, 9][(IMM8 >> 1) as usize & 1],
                [2, 10][(IMM8 >> 2) as usize & 1],
                [3, 11][(IMM8 >> 3) as usize & 1],
                [4, 12][(IMM8 >> 4) as usize & 1],
                [5, 13][(IMM8 >> 5) as usize & 1],
                [6, 14][(IMM8 >> 6) as usize & 1],
                [7, 15][(IMM8 >> 7) as usize & 1],
            ]
        ))
    }
}
102
103/// Blend packed double-precision (64-bit) floating-point elements from `a`
104/// and `b` using `mask`
105///
106/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
107#[inline]
108#[target_feature(enable = "sse4.1")]
109#[cfg_attr(test, assert_instr(blendvpd))]
110#[stable(feature = "simd_x86", since = "1.27.0")]
111#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
112pub const fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
113    unsafe {
114        let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO);
115        transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
116    }
117}
118
119/// Blend packed single-precision (32-bit) floating-point elements from `a`
120/// and `b` using `mask`
121///
122/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
123#[inline]
124#[target_feature(enable = "sse4.1")]
125#[cfg_attr(test, assert_instr(blendvps))]
126#[stable(feature = "simd_x86", since = "1.27.0")]
127#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
128pub const fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
129    unsafe {
130        let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO);
131        transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
132    }
133}
134
/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        // Shuffle indices 0..=1 name lanes of `a`, 2..=3 name lanes of `b`;
        // each bit of `IMM2` indexes a two-element table choosing per lane.
        transmute::<f64x2, _>(simd_shuffle!(
            a.as_f64x2(),
            b.as_f64x2(),
            [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
        ))
    }
}
158
/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        // Shuffle indices 0..=3 name lanes of `a`, 4..=7 name lanes of `b`;
        // bit `n` of `IMM4` picks lane `n` from `a` (clear) or `b` (set).
        transmute::<f32x4, _>(simd_shuffle!(
            a.as_f32x4(),
            b.as_f32x4(),
            [
                [0, 4][IMM4 as usize & 1],
                [1, 5][(IMM4 >> 1) as usize & 1],
                [2, 6][(IMM4 >> 2) as usize & 1],
                [3, 7][(IMM4 >> 3) as usize & 1],
            ]
        ))
    }
}
184
/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating point number via casting.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)] // FIXME remove after stdarch bump in rustc
/// #       unsafe fn worker() { unsafe {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
/// float_store.push(f32::from_bits(x as u32));
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(extractps, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    // `to_bits` returns the raw IEEE-754 bit pattern, matching `extractps`,
    // which moves the bits unchanged rather than converting the value.
    unsafe { simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 }
}
221
222/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
223/// integer containing the zero-extended integer data.
224///
225/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
226///
227/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
228#[inline]
229#[target_feature(enable = "sse4.1")]
230#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
231#[rustc_legacy_const_generics(1)]
232#[stable(feature = "simd_x86", since = "1.27.0")]
233#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
234pub const fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
235    static_assert_uimm_bits!(IMM8, 4);
236    unsafe { simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 }
237}
238
239/// Extracts an 32-bit integer from `a` selected with `IMM8`
240///
241/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
242#[inline]
243#[target_feature(enable = "sse4.1")]
244#[cfg_attr(test, assert_instr(extractps, IMM8 = 1))]
245#[rustc_legacy_const_generics(1)]
246#[stable(feature = "simd_x86", since = "1.27.0")]
247#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
248pub const fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
249    static_assert_uimm_bits!(IMM8, 2);
250    unsafe { simd_extract!(a.as_i32x4(), IMM8 as u32, i32) }
251}
252
/// Select a single value in `b` to store at some position in `a`,
/// Then zero elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegates to the `insertps` intrinsic, which interprets the
    // select/destination/zero fields of the immediate as described above.
    unsafe { insertps(a, b, IMM8 as u8) }
}
286
287/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
288/// location specified by `IMM8`.
289///
290/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
291#[inline]
292#[target_feature(enable = "sse4.1")]
293#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
294#[rustc_legacy_const_generics(2)]
295#[stable(feature = "simd_x86", since = "1.27.0")]
296#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
297pub const fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
298    static_assert_uimm_bits!(IMM8, 4);
299    unsafe { transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) }
300}
301
302/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
303/// location specified by `IMM8`.
304///
305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
306#[inline]
307#[target_feature(enable = "sse4.1")]
308#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
309#[rustc_legacy_const_generics(2)]
310#[stable(feature = "simd_x86", since = "1.27.0")]
311#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
312pub const fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
313    static_assert_uimm_bits!(IMM8, 2);
314    unsafe { transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i)) }
315}
316
317/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
318/// values in dst.
319///
320/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
321#[inline]
322#[target_feature(enable = "sse4.1")]
323#[cfg_attr(test, assert_instr(pmaxsb))]
324#[stable(feature = "simd_x86", since = "1.27.0")]
325#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
326pub const fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
327    unsafe { simd_imax(a.as_i8x16(), b.as_i8x16()).as_m128i() }
328}
329
330/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
331/// maximum.
332///
333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
334#[inline]
335#[target_feature(enable = "sse4.1")]
336#[cfg_attr(test, assert_instr(pmaxuw))]
337#[stable(feature = "simd_x86", since = "1.27.0")]
338#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
339pub const fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
340    unsafe { simd_imax(a.as_u16x8(), b.as_u16x8()).as_m128i() }
341}
342
343/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
344/// values.
345///
346/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
347#[inline]
348#[target_feature(enable = "sse4.1")]
349#[cfg_attr(test, assert_instr(pmaxsd))]
350#[stable(feature = "simd_x86", since = "1.27.0")]
351#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
352pub const fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
353    unsafe { simd_imax(a.as_i32x4(), b.as_i32x4()).as_m128i() }
354}
355
356/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
357/// maximum values.
358///
359/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
360#[inline]
361#[target_feature(enable = "sse4.1")]
362#[cfg_attr(test, assert_instr(pmaxud))]
363#[stable(feature = "simd_x86", since = "1.27.0")]
364#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
365pub const fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
366    unsafe { simd_imax(a.as_u32x4(), b.as_u32x4()).as_m128i() }
367}
368
369/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
370/// values in dst.
371///
372/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
373#[inline]
374#[target_feature(enable = "sse4.1")]
375#[cfg_attr(test, assert_instr(pminsb))]
376#[stable(feature = "simd_x86", since = "1.27.0")]
377#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
378pub const fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
379    unsafe { simd_imin(a.as_i8x16(), b.as_i8x16()).as_m128i() }
380}
381
382/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
383/// minimum.
384///
385/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
386#[inline]
387#[target_feature(enable = "sse4.1")]
388#[cfg_attr(test, assert_instr(pminuw))]
389#[stable(feature = "simd_x86", since = "1.27.0")]
390#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
391pub const fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
392    unsafe { simd_imin(a.as_u16x8(), b.as_u16x8()).as_m128i() }
393}
394
395/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
396/// values.
397///
398/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
399#[inline]
400#[target_feature(enable = "sse4.1")]
401#[cfg_attr(test, assert_instr(pminsd))]
402#[stable(feature = "simd_x86", since = "1.27.0")]
403#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
404pub const fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
405    unsafe { simd_imin(a.as_i32x4(), b.as_i32x4()).as_m128i() }
406}
407
408/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
409/// minimum values.
410///
411/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32)
412#[inline]
413#[target_feature(enable = "sse4.1")]
414#[cfg_attr(test, assert_instr(pminud))]
415#[stable(feature = "simd_x86", since = "1.27.0")]
416#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
417pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
418    unsafe { simd_imin(a.as_u32x4(), b.as_u32x4()).as_m128i() }
419}
420
/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        // Saturation bounds: clamp each signed 32-bit lane into [0, 65535]
        // before narrowing, which reproduces `packusdw`'s unsigned saturation.
        let max = simd_splat(u16::MAX as i32);
        let min = simd_splat(u16::MIN as i32);

        // After clamping, each i32 lane fits in 16 bits, so reinterpreting
        // as i16x8 leaves the wanted value in the low half of each pair.
        let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min)
            .as_m128i()
            .as_i16x8();
        let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min)
            .as_m128i()
            .as_i16x8();

        // Shuffle the low u16 of each i32 from two concatenated vectors into
        // the low bits of the result register.
        const IDXS: [u32; 8] = [0, 2, 4, 6, 8, 10, 12, 14];
        let result: i16x8 = simd_shuffle!(clamped_a, clamped_b, IDXS);

        result.as_m128i()
    }
}
450
451/// Compares packed 64-bit integers in `a` and `b` for equality
452///
453/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64)
454#[inline]
455#[target_feature(enable = "sse4.1")]
456#[cfg_attr(test, assert_instr(pcmpeqq))]
457#[stable(feature = "simd_x86", since = "1.27.0")]
458#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
459pub const fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
460    unsafe { transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) }
461}
462
463/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
464///
465/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16)
466#[inline]
467#[target_feature(enable = "sse4.1")]
468#[cfg_attr(test, assert_instr(pmovsxbw))]
469#[stable(feature = "simd_x86", since = "1.27.0")]
470#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
471pub const fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
472    unsafe {
473        let a = a.as_i8x16();
474        let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
475        transmute(simd_cast::<_, i16x8>(a))
476    }
477}
478
479/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
480///
481/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32)
482#[inline]
483#[target_feature(enable = "sse4.1")]
484#[cfg_attr(test, assert_instr(pmovsxbd))]
485#[stable(feature = "simd_x86", since = "1.27.0")]
486#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
487pub const fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
488    unsafe {
489        let a = a.as_i8x16();
490        let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
491        transmute(simd_cast::<_, i32x4>(a))
492    }
493}
494
495/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
496/// 64-bit integers
497///
498/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64)
499#[inline]
500#[target_feature(enable = "sse4.1")]
501#[cfg_attr(test, assert_instr(pmovsxbq))]
502#[stable(feature = "simd_x86", since = "1.27.0")]
503#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
504pub const fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
505    unsafe {
506        let a = a.as_i8x16();
507        let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
508        transmute(simd_cast::<_, i64x2>(a))
509    }
510}
511
512/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
513///
514/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32)
515#[inline]
516#[target_feature(enable = "sse4.1")]
517#[cfg_attr(test, assert_instr(pmovsxwd))]
518#[stable(feature = "simd_x86", since = "1.27.0")]
519#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
520pub const fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
521    unsafe {
522        let a = a.as_i16x8();
523        let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
524        transmute(simd_cast::<_, i32x4>(a))
525    }
526}
527
528/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
529///
530/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64)
531#[inline]
532#[target_feature(enable = "sse4.1")]
533#[cfg_attr(test, assert_instr(pmovsxwq))]
534#[stable(feature = "simd_x86", since = "1.27.0")]
535#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
536pub const fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
537    unsafe {
538        let a = a.as_i16x8();
539        let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
540        transmute(simd_cast::<_, i64x2>(a))
541    }
542}
543
544/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
545///
546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64)
547#[inline]
548#[target_feature(enable = "sse4.1")]
549#[cfg_attr(test, assert_instr(pmovsxdq))]
550#[stable(feature = "simd_x86", since = "1.27.0")]
551#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
552pub const fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
553    unsafe {
554        let a = a.as_i32x4();
555        let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
556        transmute(simd_cast::<_, i64x2>(a))
557    }
558}
559
560/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
561///
562/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16)
563#[inline]
564#[target_feature(enable = "sse4.1")]
565#[cfg_attr(test, assert_instr(pmovzxbw))]
566#[stable(feature = "simd_x86", since = "1.27.0")]
567#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
568pub const fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
569    unsafe {
570        let a = a.as_u8x16();
571        let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
572        transmute(simd_cast::<_, i16x8>(a))
573    }
574}
575
576/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
577///
578/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32)
579#[inline]
580#[target_feature(enable = "sse4.1")]
581#[cfg_attr(test, assert_instr(pmovzxbd))]
582#[stable(feature = "simd_x86", since = "1.27.0")]
583#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
584pub const fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
585    unsafe {
586        let a = a.as_u8x16();
587        let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
588        transmute(simd_cast::<_, i32x4>(a))
589    }
590}
591
592/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
593///
594/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64)
595#[inline]
596#[target_feature(enable = "sse4.1")]
597#[cfg_attr(test, assert_instr(pmovzxbq))]
598#[stable(feature = "simd_x86", since = "1.27.0")]
599#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
600pub const fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
601    unsafe {
602        let a = a.as_u8x16();
603        let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
604        transmute(simd_cast::<_, i64x2>(a))
605    }
606}
607
608/// Zeroes extend packed unsigned 16-bit integers in `a`
609/// to packed 32-bit integers
610///
611/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32)
612#[inline]
613#[target_feature(enable = "sse4.1")]
614#[cfg_attr(test, assert_instr(pmovzxwd))]
615#[stable(feature = "simd_x86", since = "1.27.0")]
616#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
617pub const fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
618    unsafe {
619        let a = a.as_u16x8();
620        let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
621        transmute(simd_cast::<_, i32x4>(a))
622    }
623}
624
625/// Zeroes extend packed unsigned 16-bit integers in `a`
626/// to packed 64-bit integers
627///
628/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64)
629#[inline]
630#[target_feature(enable = "sse4.1")]
631#[cfg_attr(test, assert_instr(pmovzxwq))]
632#[stable(feature = "simd_x86", since = "1.27.0")]
633#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
634pub const fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
635    unsafe {
636        let a = a.as_u16x8();
637        let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
638        transmute(simd_cast::<_, i64x2>(a))
639    }
640}
641
642/// Zeroes extend packed unsigned 32-bit integers in `a`
643/// to packed 64-bit integers
644///
645/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64)
646#[inline]
647#[target_feature(enable = "sse4.1")]
648#[cfg_attr(test, assert_instr(pmovzxdq))]
649#[stable(feature = "simd_x86", since = "1.27.0")]
650#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
651pub const fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
652    unsafe {
653        let a = a.as_u32x4();
654        let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
655        transmute(simd_cast::<_, i64x2>(a))
656    }
657}
658
659/// Returns the dot product of two __m128d vectors.
660///
661/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
662/// If a condition mask bit is zero, the corresponding multiplication is
663/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
664/// the dot product will be stored in the return value component. Otherwise if
665/// the broadcast mask bit is zero then the return component will be zero.
666///
667/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd)
668#[inline]
669#[target_feature(enable = "sse4.1")]
670#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
671#[rustc_legacy_const_generics(2)]
672#[stable(feature = "simd_x86", since = "1.27.0")]
673pub fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
674    unsafe {
675        static_assert_uimm_bits!(IMM8, 8);
676        dppd(a, b, IMM8 as u8)
677    }
678}
679
/// Returns the dot product of two __m128 vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegates to the `dpps` intrinsic; the immediate carries both the
    // condition mask (high nibble) and broadcast mask (low nibble).
    unsafe { dpps(a, b, IMM8 as u8) }
}
698
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_floor_pd(a: __m128d) -> __m128d {
    // Generic lane-wise floor; expected to lower to `roundpd` per the
    // `assert_instr` check above.
    unsafe { simd_floor(a) }
}
712
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_floor_ps(a: __m128) -> __m128 {
    // Generic lane-wise floor; expected to lower to `roundps` per the
    // `assert_instr` check above.
    unsafe { simd_floor(a) }
}
726
727/// Round the lower double-precision (64-bit) floating-point element in `b`
728/// down to an integer value, store the result as a double-precision
729/// floating-point element in the lower element of the intrinsic result,
730/// and copies the upper element from `a` to the upper element of the intrinsic
731/// result.
732///
733/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd)
734#[inline]
735#[target_feature(enable = "sse4.1")]
736#[cfg_attr(test, assert_instr(roundsd))]
737#[stable(feature = "simd_x86", since = "1.27.0")]
738pub fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
739    unsafe { roundsd(a, b, _MM_FROUND_FLOOR) }
740}
741
742/// Round the lower single-precision (32-bit) floating-point element in `b`
743/// down to an integer value, store the result as a single-precision
744/// floating-point element in the lower element of the intrinsic result,
745/// and copies the upper 3 packed elements from `a` to the upper elements
746/// of the intrinsic result.
747///
748/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss)
749#[inline]
750#[target_feature(enable = "sse4.1")]
751#[cfg_attr(test, assert_instr(roundss))]
752#[stable(feature = "simd_x86", since = "1.27.0")]
753pub fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
754    unsafe { roundss(a, b, _MM_FROUND_FLOOR) }
755}
756
757/// Round the packed double-precision (64-bit) floating-point elements in `a`
758/// up to an integer value, and stores the results as packed double-precision
759/// floating-point elements.
760///
761/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd)
762#[inline]
763#[target_feature(enable = "sse4.1")]
764#[cfg_attr(test, assert_instr(roundpd))]
765#[stable(feature = "simd_x86", since = "1.27.0")]
766#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
767pub const fn _mm_ceil_pd(a: __m128d) -> __m128d {
768    unsafe { simd_ceil(a) }
769}
770
771/// Round the packed single-precision (32-bit) floating-point elements in `a`
772/// up to an integer value, and stores the results as packed single-precision
773/// floating-point elements.
774///
775/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps)
776#[inline]
777#[target_feature(enable = "sse4.1")]
778#[cfg_attr(test, assert_instr(roundps))]
779#[stable(feature = "simd_x86", since = "1.27.0")]
780#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
781pub const fn _mm_ceil_ps(a: __m128) -> __m128 {
782    unsafe { simd_ceil(a) }
783}
784
785/// Round the lower double-precision (64-bit) floating-point element in `b`
786/// up to an integer value, store the result as a double-precision
787/// floating-point element in the lower element of the intrinsic result,
788/// and copies the upper element from `a` to the upper element
789/// of the intrinsic result.
790///
791/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd)
792#[inline]
793#[target_feature(enable = "sse4.1")]
794#[cfg_attr(test, assert_instr(roundsd))]
795#[stable(feature = "simd_x86", since = "1.27.0")]
796pub fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
797    unsafe { roundsd(a, b, _MM_FROUND_CEIL) }
798}
799
800/// Round the lower single-precision (32-bit) floating-point element in `b`
801/// up to an integer value, store the result as a single-precision
802/// floating-point element in the lower element of the intrinsic result,
803/// and copies the upper 3 packed elements from `a` to the upper elements
804/// of the intrinsic result.
805///
806/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss)
807#[inline]
808#[target_feature(enable = "sse4.1")]
809#[cfg_attr(test, assert_instr(roundss))]
810#[stable(feature = "simd_x86", since = "1.27.0")]
811pub fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
812    unsafe { roundss(a, b, _MM_FROUND_CEIL) }
813}
814
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    // Only the low 4 bits of the immediate are meaningful (mode + exception
    // suppression); reject anything wider at compile time.
    static_assert_uimm_bits!(ROUNDING, 4);
    // SAFETY: `roundpd` (declared below) only requires the `sse4.1` feature,
    // enabled on this function.
    unsafe { roundpd(a, ROUNDING) }
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    // Only the low 4 bits of the immediate are meaningful.
    static_assert_uimm_bits!(ROUNDING, 4);
    // SAFETY: `roundps` (declared below) only requires the `sse4.1` feature,
    // enabled on this function.
    unsafe { roundps(a, ROUNDING) }
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    // Only the low 4 bits of the immediate are meaningful.
    static_assert_uimm_bits!(ROUNDING, 4);
    // SAFETY: `roundsd` (declared below) only requires the `sse4.1` feature,
    // enabled on this function.
    unsafe { roundsd(a, b, ROUNDING) }
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    // Only the low 4 bits of the immediate are meaningful.
    static_assert_uimm_bits!(ROUNDING, 4);
    // SAFETY: `roundss` (declared below) only requires the `sse4.1` feature,
    // enabled on this function.
    unsafe { roundss(a, b, ROUNDING) }
}
906
/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector,
/// returning a vector containing its value in its first position, and its
/// index
/// in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    // SAFETY: `phminposuw` (declared below) only requires the `sse4.1`
    // feature; the transmute reinterprets u16x8 as __m128i (same size/layout).
    unsafe { transmute(phminposuw(a.as_u16x8())) }
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    // SAFETY: `simd_cast`/`simd_mul`/`transmute` operate on plain vector
    // values of matching lane counts and sizes.
    unsafe {
        // Truncate each 64-bit lane to its low 32 bits (i32x2), then
        // sign-extend back to i64x2, so the 64x64 multiply below computes the
        // signed product of the low halves — matching `pmuldq` semantics.
        let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
        let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
        transmute(simd_mul(a, b))
    }
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
/// 64-bit integers, and returns the lowest 32 bits, whatever they might be,
/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
/// return a negative number.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    // SAFETY: `simd_mul` on i32x4 is a lane-wise wrapping multiply, exactly
    // the `pmulld` behavior documented above.
    unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) }
}
969
/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences to the corresponding bits in the destination.
/// Then sums of the absolute differences are returned according to the bit
/// fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specify the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    // Only bits [2:0] of the immediate are used (see the algorithm above),
    // so anything wider is rejected at compile time.
    static_assert_uimm_bits!(IMM8, 3);
    // SAFETY: `mpsadbw` (declared below) only requires the `sse4.1` feature,
    // enabled on this function.
    unsafe { transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8)) }
}
1012
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    // SAFETY: `simd_and`/`simd_reduce_or` are plain lane-wise/reduction
    // vector intrinsics on i64x2.
    unsafe {
        // PTEST's ZF result: 1 iff (a AND mask) == 0 across all 128 bits.
        let r = simd_reduce_or(simd_and(a.as_i64x2(), mask.as_i64x2()));
        (0i64 == r) as i32
    }
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    // SAFETY: `simd_xor`/`simd_and`/`simd_reduce_or` are plain vector
    // intrinsics on i64x2.
    unsafe {
        // PTEST's CF result: 1 iff (NOT a AND mask) == 0, i.e. every bit
        // selected by `mask` is set in `a`. `xor` with all-ones computes NOT.
        let r = simd_reduce_or(simd_and(
            simd_xor(a.as_i64x2(), i64x2::splat(!0)),
            mask.as_i64x2(),
        ));
        (0i64 == r) as i32
    }
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    // SAFETY: `ptestnzc` (declared below) only requires the `sse4.1` feature,
    // enabled on this function.
    unsafe { ptestnzc(a.as_i64x2(), mask.as_i64x2()) }
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Alias for `_mm_testz_si128` (same ZF semantics, different Intel name).
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in `a` 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_test_all_ones(a: __m128i) -> i32 {
    // `_mm_cmpeq_epi32(a, a)` produces an all-ones mask, so this checks the
    // CF condition against every bit of `a`.
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Alias for `_mm_testnzc_si128` (same ZF==0 && CF==0 semantics).
    _mm_testnzc_si128(a, mask)
}
1162
/// Load 128-bits of integer data from memory into dst. mem_addr must be aligned on a 16-byte
/// boundary or a general-protection exception may be generated. To minimize caching, the data
/// is flagged as non-temporal (unlikely to be used again soon)
///
/// # Safety
///
/// `mem_addr` must be valid for a 16-byte read and 16-byte aligned, as stated
/// above; violating either is undefined behavior.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(movntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
    let dst: __m128i;
    // NOTE(review): `vpl!` presumably expands the mnemonic plus a `[{p}]`
    // memory operand (VEX or legacy form) — confirm against the macro's
    // definition elsewhere in this crate.
    crate::arch::asm!(
        vpl!("movntdqa {a}"),
        a = out(xmm_reg) dst,
        p = in(reg) mem_addr,
        // pure+readonly: the asm only reads `*mem_addr` and writes `dst`.
        options(pure, readonly, nostack, preserves_flags),
    );
    dst
}
1182
// Declarations of the LLVM builtins that back the SSE4.1 intrinsics above.
// These link directly to the compiler's `llvm.x86.sse41.*` intrinsics.
#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
1206
1207#[cfg(test)]
1208mod tests {
1209    use crate::core_arch::assert_eq_const as assert_eq;
1210    use crate::core_arch::x86::*;
1211    use std::mem;
1212    use stdarch_test::simd_test;
1213
    // Variable blend: lanes whose `mask` byte is -1 take `b`, lanes with 0
    // keep `a` (see the expected vector `e`).
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_blendv_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let mask = _mm_setr_epi8(
            0, -1, 0, -1, 0, -1, 0, -1,
            0, -1, 0, -1, 0, -1, 0, -1,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
        );
        assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
    }

    // Double-precision variable blend with an all-ones mask in lane 1 only.
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_blendv_pd() {
        let a = _mm_set1_pd(0.0);
        let b = _mm_set1_pd(1.0);
        let mask = _mm_castsi128_pd(_mm_setr_epi64x(0, -1));
        let r = _mm_blendv_pd(a, b, mask);
        let e = _mm_setr_pd(0.0, 1.0);
        assert_eq_m128d(r, e);
    }

    // Single-precision variable blend selecting `b` in the odd lanes.
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_blendv_ps() {
        let a = _mm_set1_ps(0.0);
        let b = _mm_set1_ps(1.0);
        let mask = _mm_castsi128_ps(_mm_setr_epi32(0, -1, 0, -1));
        let r = _mm_blendv_ps(a, b, mask);
        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
        assert_eq_m128(r, e);
    }

    // Immediate blend: bit i of the constant selects `b` for lane i.
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_blend_pd() {
        let a = _mm_set1_pd(0.0);
        let b = _mm_set1_pd(1.0);
        let r = _mm_blend_pd::<0b10>(a, b);
        let e = _mm_setr_pd(0.0, 1.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    const fn test_mm_blend_ps() {
        let a = _mm_set1_ps(0.0);
        let b = _mm_set1_ps(1.0);
        let r = _mm_blend_ps::<0b1010>(a, b);
        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    const fn test_mm_blend_epi16() {
        let a = _mm_set1_epi16(0);
        let b = _mm_set1_epi16(1);
        let r = _mm_blend_epi16::<0b1010_1100>(a, b);
        let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
        assert_eq_m128i(r, e);
    }

    // `_mm_extract_ps` returns the lane's raw bits as i32; round-trip them
    // through `f32::from_bits` to compare as floats.
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_extract_ps() {
        let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
        let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
        assert_eq!(r, 1.0);
        let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
        assert_eq!(r, 3.0);
    }
1292
    // `_mm_extract_epi8` zero-extends the byte: lane 0 holds -1 (0xFF as a
    // byte) and must come back as 0xFF, not -1.
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_extract_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15
        );
        let r1 = _mm_extract_epi8::<0>(a);
        let r2 = _mm_extract_epi8::<3>(a);
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "sse4.1")]
    const fn test_mm_extract_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let r = _mm_extract_epi32::<1>(a);
        assert_eq!(r, 1);
        let r = _mm_extract_epi32::<3>(a);
        assert_eq!(r, 3);
    }

    // INSERTPS immediate: bits [7:6] pick the source lane of `b`, bits [5:4]
    // the destination lane in `a`, bits [3:0] zero out result lanes.
    #[simd_test(enable = "sse4.1")]
    fn test_mm_insert_ps() {
        let a = _mm_set1_ps(1.0);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_insert_ps::<0b11_00_1100>(a, b);
        let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
        assert_eq_m128(r, e);

        // Zeroing takes precedence over copied value
        let a = _mm_set1_ps(1.0);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_insert_ps::<0b11_00_0001>(a, b);
        let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0);
        assert_eq_m128(r, e);
    }

    // Insert at a low lane and a high lane to cover both halves.
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_insert_epi8() {
        let a = _mm_set1_epi8(0);
        let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_insert_epi8::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
        let r = _mm_insert_epi8::<14>(a, 32);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    const fn test_mm_insert_epi32() {
        let a = _mm_set1_epi32(0);
        let e = _mm_setr_epi32(0, 32, 0, 0);
        let r = _mm_insert_epi32::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi32(0, 0, 0, 32);
        let r = _mm_insert_epi32::<3>(a, 32);
        assert_eq_m128i(r, e);
    }
1352
    // Lane-wise signed byte max: interleaved inputs so each lane exercises
    // both "a wins" and "b wins" cases.
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_max_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_max_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 4, 6, 8, 10, 12, 14, 16,
            18, 20, 22, 24, 26, 28, 30, 32,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    const fn test_mm_max_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_max_epu16(a, b);
        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    const fn test_mm_max_epi32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epi32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    const fn test_mm_max_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epu32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    // Signed byte min: a second phase with negative values verifies signed
    // (not unsigned) comparison semantics.
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_min_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, -4, -5, 8, -9, -12, 13, -16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, -3, -6, 7, -10, -11, 14, -15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, -4, -6, 7, -10, -12, 13, -16,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    const fn test_mm_min_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_min_epu16(a, b);
        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
        assert_eq_m128i(r, e);
    }

    // Signed dword min, including a negative-value phase for signed-compare
    // coverage.
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_min_epi32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);

        let a = _mm_setr_epi32(-1, 4, 5, -7);
        let b = _mm_setr_epi32(-2, 3, -6, 8);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(-2, 3, -6, -7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    const fn test_mm_min_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epu32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }
1472
1473    #[simd_test(enable = "sse4.1")]
1474    const fn test_mm_packus_epi32() {
1475        let a = _mm_setr_epi32(1, 2, 3, 4);
1476        let b = _mm_setr_epi32(-1, -2, -3, -4);
1477        let r = _mm_packus_epi32(a, b);
1478        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
1479        assert_eq_m128i(r, e);
1480    }
1481
1482    #[simd_test(enable = "sse4.1")]
1483    const fn test_mm_cmpeq_epi64() {
1484        let a = _mm_setr_epi64x(0, 1);
1485        let b = _mm_setr_epi64x(0, 0);
1486        let r = _mm_cmpeq_epi64(a, b);
1487        let e = _mm_setr_epi64x(-1, 0);
1488        assert_eq_m128i(r, e);
1489    }
1490
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepi8_epi16() {
        // Sign-extend i8 lanes to i16: positive case.
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        // Negative case: the sign bit must propagate into the upper byte.
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }
1502
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepi8_epi32() {
        // Sign-extend i8 lanes to i32: positive case.
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        // Negative case: sign bit must fill the upper three bytes.
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }
1514
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepi8_epi64() {
        // Sign-extend i8 lanes to i64: positive case.
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        // Negative case: sign bit must fill the upper seven bytes.
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }
1526
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepi16_epi32() {
        // Sign-extend i16 lanes to i32: positive case.
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        // Negative case: sign extension must preserve the value.
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }
1538
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepi16_epi64() {
        // Sign-extend i16 lanes to i64: positive case.
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        // Negative case: sign extension must preserve the value.
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }
1550
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepi32_epi64() {
        // Sign-extend i32 lanes to i64: positive case.
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        // Negative case: sign extension must preserve the value.
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }
1562
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepu8_epi16() {
        // Zero-extend u8 lanes to i16 (only a non-negative input is
        // needed; negatives would exercise the sign-extend variant).
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }
1570
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepu8_epi32() {
        // Zero-extend u8 lanes to i32.
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }
1578
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepu8_epi64() {
        // Zero-extend u8 lanes to i64.
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }
1586
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepu16_epi32() {
        // Zero-extend u16 lanes to i32.
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }
1594
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepu16_epi64() {
        // Zero-extend u16 lanes to i64.
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }
1602
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_cvtepu32_epi64() {
        // Zero-extend u32 lanes to i64.
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }
1610
    #[simd_test(enable = "sse4.1")]
    fn test_mm_dp_pd() {
        // Conditional dot product. imm8 = 0b0011_0001: the high nibble
        // (0b0011) multiplies both lanes (2*1 + 3*4 = 14), the low nibble
        // (0b0001) stores the sum only into lane 0; lane 1 is zeroed.
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
    }
1618
    #[simd_test(enable = "sse4.1")]
    fn test_mm_dp_ps() {
        // Conditional dot product. imm8 = 0b0111_0101: the high nibble
        // selects lanes 0..=2 for the products (2*1 + 3*4 + 1*0.5 = 14.5;
        // lane 3 is excluded), and the low nibble broadcasts the sum into
        // lanes 0 and 2, zeroing lanes 1 and 3.
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
    }
1626
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_floor_pd() {
        // Round both f64 lanes toward negative infinity.
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }
1634
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_floor_ps() {
        // Round all four f32 lanes toward negative infinity.
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }
1642
    #[simd_test(enable = "sse4.1")]
    fn test_mm_floor_sd() {
        // Scalar variant: lane 0 of the result is floor(b[0]) = -2.0;
        // lane 1 is copied unchanged from `a`.
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }
1651
    #[simd_test(enable = "sse4.1")]
    fn test_mm_floor_ss() {
        // Scalar variant: lane 0 of the result is floor(b[0]) = -2.0;
        // lanes 1..=3 are copied unchanged from `a`.
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }
1660
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_ceil_pd() {
        // Round both f64 lanes toward positive infinity.
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }
1668
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_ceil_ps() {
        // Round all four f32 lanes toward positive infinity.
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }
1676
    #[simd_test(enable = "sse4.1")]
    fn test_mm_ceil_sd() {
        // Scalar variant: lane 0 of the result is ceil(b[0]) = -2.0;
        // lane 1 is copied unchanged from `a`.
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }
1685
    #[simd_test(enable = "sse4.1")]
    fn test_mm_ceil_ss() {
        // Scalar variant: lane 0 of the result is ceil(b[0]) = -2.0;
        // lanes 1..=3 are copied unchanged from `a`.
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }
1694
    #[simd_test(enable = "sse4.1")]
    fn test_mm_round_pd() {
        // Round-to-nearest: 1.25 -> 1.0 and 3.75 -> 4.0 (neither value is
        // a tie, so the result is mode-independent nearest rounding).
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }
1702
    #[simd_test(enable = "sse4.1")]
    fn test_mm_round_ps() {
        // Round-toward-zero (truncation): fractional parts are dropped for
        // both positive and negative lanes (-1.75 -> -1.0, not -2.0).
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }
1710
    #[simd_test(enable = "sse4.1")]
    fn test_mm_round_sd() {
        // Scalar rounding: lane 0 is b[0] rounded per the mode, lane 1 is
        // copied from `a`. Nearest: the tie -2.5 rounds to the even -2.0.
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);

        // Toward negative infinity: -2.5 -> -3.0.
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b);
        let e = _mm_setr_pd(-3.0, 3.5);
        assert_eq_m128d(r, e);

        // Toward positive infinity: -2.5 -> -2.0.
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);

        // Toward zero (truncation): -2.5 -> -2.0.
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }
1737
    #[simd_test(enable = "sse4.1")]
    fn test_mm_round_ss() {
        // Scalar rounding: lane 0 is b[0] rounded per the mode, lanes
        // 1..=3 are copied from `a`. Nearest: -1.75 -> -2.0.
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        // Toward negative infinity: -1.75 -> -2.0.
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        // Toward positive infinity: -1.75 -> -1.0.
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b);
        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        // Toward zero (truncation): -1.75 -> -1.0.
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b);
        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }
1764
    #[simd_test(enable = "sse4.1")]
    fn test_mm_minpos_epu16_1() {
        // Horizontal unsigned minimum: lane 0 of the result holds the
        // smallest value (13), lane 1 its index (5), remaining lanes zero.
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
1772
    #[simd_test(enable = "sse4.1")]
    fn test_mm_minpos_epu16_2() {
        // Minimum value 0 sits at index 0, so the whole result is zero.
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
1780
    #[simd_test(enable = "sse4.1")]
    fn test_mm_minpos_epu16_3() {
        // Case where the minimum value is repeated
        // (13 appears at indices 5 and 7): the *lowest* index wins.
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 13);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
1789
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_mul_epi32() {
        // Multiplies only the even-index 32-bit lanes (0 and 2) as signed
        // values, producing two full 64-bit products; odd lanes are ignored.
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            // 15 * -20 = -300 and 1234567 * 666666 = 823043843622; the
            // second product exceeds 32 bits, proving a widening multiply.
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(
                -20, -256, /* ignored */
                666666, 666666, /* ignored */
            );
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }
1810
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_mullo_epi32() {
        // Per-lane 32-bit multiply keeping only the low 32 bits of each
        // 64-bit product.
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // Attention, most significant bit in r[2] is treated
            // as a sign bit:
            // 1234567 * 666666 = -1589877210
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }
1831
    #[simd_test(enable = "sse4.1")]
    fn test_mm_minpos_epu16() {
        // Basic case: minimum 1 at index 5; lanes 2..=7 of the result
        // are zeroed.
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
1839
    #[simd_test(enable = "sse4.1")]
    fn test_mm_mpsadbw_epu8() {
        // Multiple sums of absolute differences of unsigned bytes against
        // a rolling 4-byte window of `a`. imm8 bit 2 selects the 4-byte
        // offset within the *first* operand (0 or 4); bits 1:0 select the
        // 4-byte offset within the *second* operand (0, 4, 8, or 12).
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        // Offsets 0/0: windows diverge by 1 per step -> 0,4,8,...
        let r = _mm_mpsadbw_epu8::<0b000>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        // Offsets 0/4: second-operand block starts 4 bytes later.
        let r = _mm_mpsadbw_epu8::<0b001>(a, a);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        // Offsets 4/0: first-operand windows start 4 bytes later.
        let r = _mm_mpsadbw_epu8::<0b100>(a, a);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        // Offsets 4/4: both shifted equally, same result as 0/0.
        let r = _mm_mpsadbw_epu8::<0b101>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        // Offsets 4/12: maximum second-operand offset.
        let r = _mm_mpsadbw_epu8::<0b111>(a, a);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }
1868
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_testz_si128() {
        // Returns 1 iff (a AND mask) is all zeros (PTEST ZF semantics).
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        // 0b101 & 0b110 = 0b100 != 0 -> 0.
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        // 0b011 & 0b100 = 0 -> 1.
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }
1884
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_testc_si128() {
        // Returns 1 iff (NOT a AND mask) is all zeros, i.e. `a` covers
        // every bit set in `mask` (PTEST CF semantics).
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        // mask bit 1 (0b010) is not set in a (0b101) -> 0.
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        // mask 0b100 is fully contained in a (0b101) -> 1.
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }
1900
    #[simd_test(enable = "sse4.1")]
    fn test_mm_testnzc_si128() {
        // Returns 1 iff BOTH (a AND mask) and (NOT a AND mask) are
        // nonzero, i.e. `mask` selects a mix of set and clear bits in `a`.
        // a & mask == 0 -> 0 (ZF set).
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        // !a & mask == 0 -> 0 (CF set).
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        // mask hits both a set bit (0b100) and a clear bit (0b010) -> 1.
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        // mask is fully contained in a -> !a & mask == 0 -> 0.
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }
1920
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_test_all_zeros() {
        // Convenience wrapper over the ZF test: same cases as
        // `test_mm_testz_si128`, verifying identical results.
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }
1936
    #[simd_test(enable = "sse4.1")]
    const fn test_mm_test_all_ones() {
        // Returns 1 iff every bit of the operand is set.
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }
1946
    #[simd_test(enable = "sse4.1")]
    fn test_mm_test_mix_ones_zeros() {
        // Convenience wrapper over the ZF/CF mix test: same cases as
        // `test_mm_testnzc_si128`, verifying identical results.
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
1966
    #[simd_test(enable = "sse4.1")]
    fn test_mm_stream_load_si128() {
        // Non-temporal (cache-bypassing) load must still observe the
        // stored value. SAFETY: the pointer is derived from a live
        // `__m128i` local, which satisfies the intrinsic's 16-byte
        // alignment requirement by its type.
        let a = _mm_set_epi64x(5, 6);
        let r = unsafe { _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _) };
        assert_eq_m128i(a, r);
    }
1973}