core/stdarch/crates/core_arch/src/x86/sse2.rs

//! Streaming SIMD Extensions 2 (SSE2)

#[cfg(test)]
use stdarch_test::assert_instr;

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf64,
    mem, ptr,
};

/// Provides a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
#[inline]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_pause() {
    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
    // the SSE2 target-feature - therefore it does not require any target features
    unsafe { pause() }
}

/// Invalidates and flushes the cache line that contains `p` from all levels of
/// the cache hierarchy.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(clflush))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clflush(p: *const u8) {
    clflush(p)
}

/// Performs a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order, the
/// load fence instruction is globally visible before any load instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_lfence() {
    unsafe { lfence() }
}

/// Performs a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mfence() {
    unsafe { mfence() }
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
}

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
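///
/// # Examples
///
/// A small sketch of the saturating behavior, assuming an `x86_64` target
/// (where SSE2 is part of the baseline instruction set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64.
///     unsafe {
///         let a = _mm_set1_epi8(-56); // bit pattern 0xC8, i.e. 200 as u8
///         let b = _mm_set1_epi8(100);
///         let r = _mm_adds_epu8(a, b); // 200 + 100 saturates to 255
///         let mut out = [0u8; 16];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///         assert_eq!(out, [255u8; 16]);
///     }
/// }
/// ```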
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
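///
/// # Examples
///
/// The average rounds upward, i.e. it computes `(a + b + 1) >> 1` per lane.
/// A minimal sketch, again assuming an `x86_64` target:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64.
///     unsafe {
///         let a = _mm_set1_epi8(1);
///         let b = _mm_set1_epi8(2);
///         let r = _mm_avg_epu8(a, b); // (1 + 2 + 1) >> 1 == 2
///         let mut out = [0u8; 16];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///         assert_eq!(out, [2u8; 16]);
///     }
/// }
/// ```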
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u16x16>(a.as_u8x16());
        let b = simd_cast::<_, u16x16>(b.as_u8x16());
        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
        transmute(simd_cast::<_, u8x16>(r))
    }
}

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
        transmute(simd_cast::<_, u16x8>(r))
    }
}

/// Multiplies and then horizontally adds signed 16-bit integers in `a` and `b`.
///
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs of
/// intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
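///
/// # Examples
///
/// A sketch of the multiply-and-accumulate pattern (assuming `x86_64`): each
/// 32-bit output lane is `a[2k] * b[2k] + a[2k+1] * b[2k+1]`.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64.
///     unsafe {
///         let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
///         let b = _mm_set1_epi16(10);
///         let r = _mm_madd_epi16(a, b);
///         let mut out = [0i32; 4];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///         assert_eq!(out, [30, 70, 110, 150]);
///     }
/// }
/// ```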
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
    //
    // ```rust
    // #[target_feature(enable = "sse2")]
    // unsafe fn widening_add(mad: __m128i) -> __m128i {
    //     _mm_madd_epi16(mad, _mm_set1_epi16(1))
    // }
    // ```
    //
    // If we implement this using generic vector intrinsics, the optimizer
    // will eliminate this pattern, and `pmaddwd` will no longer be emitted.
    // For this reason, we use x86 intrinsics.
    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let b = b.as_i16x8();
        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let b = b.as_u8x16();
        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let b = b.as_i16x8();
        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let b = b.as_u8x16();
        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
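///
/// # Examples
///
/// A sketch (assuming `x86_64`) of taking the high half of a 16-by-16-bit
/// product:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64.
///     unsafe {
///         let a = _mm_set1_epi16(1000);
///         let b = _mm_set1_epi16(1000);
///         // 1000 * 1000 = 1_000_000 = 0x000F_4240; the high 16 bits are 15.
///         let r = _mm_mulhi_epi16(a, b);
///         let mut out = [0i16; 8];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///         assert_eq!(out, [15; 8]);
///     }
/// }
/// ```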
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, i32x8>(a.as_i16x8());
        let b = simd_cast::<_, i32x8>(b.as_i16x8());
        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
        transmute(simd_cast::<i32x8, i16x8>(r))
    }
}

/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
        transmute(simd_cast::<u32x8, u16x8>(r))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
}

/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
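///
/// # Examples
///
/// A sketch (assuming `x86_64`): only the even-indexed 32-bit lanes are
/// multiplied, and the odd lanes are ignored.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64.
///     unsafe {
///         let a = _mm_setr_epi32(1_000_000, -1, 2_000_000, -1);
///         let b = _mm_setr_epi32(3_000_000, -1, 4_000_000, -1);
///         let r = _mm_mul_epu32(a, b);
///         let mut out = [0u64; 2];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///         assert_eq!(out, [3_000_000_000_000, 8_000_000_000_000]);
///     }
/// }
/// ```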
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u64x2();
        let b = b.as_u64x2();
        let mask = u64x2::splat(u32::MAX.into());
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}

/// Sums the absolute differences of packed unsigned 8-bit integers.
///
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each run of eight consecutive differences
/// to produce two unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers into the low 16 bits of the returned 64-bit elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
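///
/// # Examples
///
/// A sketch (assuming `x86_64`): with a constant difference of 2 per byte,
/// each eight-byte half of the vector sums to 16.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64.
///     unsafe {
///         let a = _mm_set1_epi8(3);
///         let b = _mm_set1_epi8(5);
///         let r = _mm_sad_epu8(a, b);
///         let mut out = [0u64; 2];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///         assert_eq!(out, [16, 16]);
///     }
/// }
/// ```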
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
}

/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
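///
/// # Examples
///
/// A sketch (assuming `x86_64`) of a whole-register byte shift; zeros enter
/// at the low (first-in-memory) end:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64.
///     unsafe {
///         let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
///         let r = _mm_slli_si128::<4>(a);
///         let mut out = [0i8; 16];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///         assert_eq!(out, [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]);
///     }
/// }
/// ```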
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_slli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 { i } else { 16 - shift + i }
    }
    transmute::<i8x16, _>(simd_shuffle!(
        i8x16::ZERO,
        a.as_i8x16(),
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    ))
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_slli_si128_impl::<IMM8>(a)
    }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_srli_si128_impl::<IMM8>(a)
    }
}

/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        if (shift as u32) > 15 {
            i + 16
        } else {
            i + (shift as u32)
        }
    }
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    );
    transmute(x)
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
}

/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(a, b) }
}

/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
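///
/// # Examples
///
/// A sketch (assuming `x86_64`): `(!a) & b` clears exactly the bits of `b`
/// that are set in `a`.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64.
///     unsafe {
///         let a = _mm_set1_epi8(0b0000_1111);
///         let b = _mm_set1_epi8(0b0101_0101);
///         let r = _mm_andnot_si128(a, b); // !0x0F & 0x55 == 0x50
///         let mut out = [0u8; 16];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///         assert_eq!(out, [0b0101_0000; 16]);
///     }
/// }
/// ```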
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
}

/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_or(a, b) }
}

/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_xor(a, b) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
}

/// Converts the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    unsafe {
        let a = a.as_i32x4();
        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
    }
}

/// Returns `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    unsafe { simd_insert!(a, 0, b as f64) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvtps2dq(a)) }
}

/// Returns a vector whose lowest element is `a` and all higher elements are
/// `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_si128(a: i32) -> __m128i {
    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
}

/// Returns the lowest element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    unsafe { simd_extract!(a.as_i32x4(), 0) }
}

/// Sets packed 64-bit integers with the supplied values, from highest to
/// lowest.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
    unsafe { transmute(i64x2::new(e0, e1)) }
}

/// Sets packed 32-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
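///
/// # Examples
///
/// Arguments run from the highest lane to the lowest, so `e0` lands in the
/// lowest (first-in-memory) position. A sketch, assuming `x86_64`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64.
///     unsafe {
///         let v = _mm_set_epi32(4, 3, 2, 1);
///         let mut out = [0i32; 4];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, v);
///         assert_eq!(out, [1, 2, 3, 4]);
///     }
/// }
/// ```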
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
}

/// Sets packed 16-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}

/// Sets packed 8-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    unsafe {
        #[rustfmt::skip]
        transmute(i8x16::new(
            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
        ))
    }
}

/// Broadcasts 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi64x(a: i64) -> __m128i {
    _mm_set_epi64x(a, a)
}

/// Broadcasts 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi32(a: i32) -> __m128i {
    _mm_set_epi32(a, a, a, a)
}

/// Broadcasts 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi16(a: i16) -> __m128i {
    _mm_set_epi16(a, a, a, a, a, a, a, a)
}

/// Broadcasts 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi8(a: i8) -> __m128i {
    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
}

/// Sets packed 32-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    _mm_set_epi32(e0, e1, e2, e3)
}

/// Sets packed 16-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}

/// Sets packed 8-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    #[rustfmt::skip]
    _mm_set_epi8(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}

/// Returns a vector with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_si128() -> __m128i {
    const { unsafe { mem::zeroed() } }
}

/// Loads a 64-bit integer from memory into the first element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
}

/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    *mem_addr
}

/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
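///
/// # Examples
///
/// A sketch (assuming `x86_64`) of an unaligned round trip through a plain
/// byte array:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64, and the source is a
///     // valid 16-byte read with no alignment requirement.
///     unsafe {
///         let bytes = [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
///         let v = _mm_loadu_si128(bytes.as_ptr() as *const __m128i);
///         let mut out = [0u8; 16];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, v);
///         assert_eq!(out, bytes);
///     }
/// }
/// ```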
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    let mut dst: __m128i = _mm_undefined_si128();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128i>(),
    );
    dst
}

/// Conditionally stores 8-bit integer elements from `a` into memory using
/// `mask`; the store is flagged as non-temporal (unlikely to be used again
/// soon).
///
/// Elements are not stored when the highest bit is not set in the
/// corresponding element of `mask`.
///
/// `mem_addr` should correspond to a 128-bit memory location and does not need
/// to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
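///
/// # Examples
///
/// A sketch (assuming `x86_64`) that writes every other byte and pairs the
/// non-temporal store with the required `_mm_sfence`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///     // SAFETY: SSE2 is always available on x86_64, and `out` is a valid
///     // 16-byte destination.
///     unsafe {
///         let a = _mm_set1_epi8(0x55);
///         // Only bytes whose mask element has its highest bit set are written.
///         let mask = _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
///         let mut out = [0i8; 16];
///         _mm_maskmoveu_si128(a, mask, out.as_mut_ptr());
///         _mm_sfence(); // required before the memory is accessed again
///         assert_eq!(
///             out,
///             [0x55, 0, 0x55, 0, 0x55, 0, 0x55, 0, 0x55, 0, 0x55, 0, 0x55, 0, 0x55, 0]
///         );
///     }
/// }
/// ```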
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    *mem_addr = a;
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    mem_addr.write_unaligned(a);
}

/// Stores the lower 64-bit integer of `a` to a memory location.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
}

/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
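///
/// # Examples
///
/// A sketch (assuming `x86_64`) using a hypothetical `Aligned` wrapper to
/// provide the required 16-byte alignment, with the matching `_mm_sfence`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use core::arch::x86_64::*;
///
///     #[repr(align(16))]
///     struct Aligned([i32; 4]);
///
///     let mut buf = Aligned([0; 4]);
///     // SAFETY: SSE2 is always available on x86_64, and `buf` is 16-byte
///     // aligned.
///     unsafe {
///         _mm_stream_si128(buf.0.as_mut_ptr() as *mut __m128i, _mm_setr_epi32(1, 2, 3, 4));
///         _mm_sfence(); // pair every non-temporal store with an sfence
///     }
///     assert_eq!(buf.0, [1, 2, 3, 4]);
/// }
/// ```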
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    crate::arch::asm!(
        vps!("movntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}

/// Stores a 32-bit integer value in the specified memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    crate::arch::asm!(
        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
        p = in(reg) mem_addr,
        a = in(reg) a,
        options(nostack, preserves_flags),
    );
}

1410/// Returns a vector where the low element is extracted from `a` and its upper
1411/// element is zero.
1412///
1413/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
1414#[inline]
1415#[target_feature(enable = "sse2")]
1416// FIXME movd on msvc, movd on i686
1417#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))]
1418#[stable(feature = "simd_x86", since = "1.27.0")]
1419pub fn _mm_move_epi64(a: __m128i) -> __m128i {
1420    unsafe {
1421        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
1422        transmute(r)
1423    }
1424}
1425
1426/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1427/// using signed saturation.
1428///
1429/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
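///
/// # Examples
///
/// A small saturation sketch (assumes an x86_64 target, where SSE2 is part
/// of the baseline feature set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let a = _mm_set1_epi16(300);  // above i8::MAX
///     let b = _mm_set1_epi16(-300); // below i8::MIN
///     let r = _mm_packs_epi16(a, b);
///     // Lanes from `a` saturate to 127 (0x7F), lanes from `b` to -128 (0x80).
///     assert_eq!(_mm_extract_epi16::<0>(r), 0x7F7F);
///     assert_eq!(_mm_extract_epi16::<4>(r), 0x8080);
/// }
/// ```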
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packsswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packssdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
}

/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packuswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
}

/// Returns the 16-bit element of `a` at the index specified by `IMM8`,
/// zero-extended to an `i32`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
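///
/// # Examples
///
/// A minimal sketch of the const-generic index and the zero-extension
/// (assumes an x86_64 target, where SSE2 is part of the baseline feature
/// set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let a = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); // lane i holds i
///     assert_eq!(_mm_extract_epi16::<5>(a), 5);
///     // The element is zero-extended, so -1 comes back as 0xFFFF.
///     let b = _mm_set1_epi16(-1);
///     assert_eq!(_mm_extract_epi16::<0>(b), 0xFFFF);
/// }
/// ```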
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
}

/// Returns a new vector where the element of `a` at the index specified by
/// `IMM8` is replaced with the lower 16 bits of `i`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
}

/// Returns a mask of the most significant bit of each element in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
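///
/// # Examples
///
/// A small sketch showing how byte sign bits map to mask bits (assumes an
/// x86_64 target, where SSE2 is part of the baseline feature set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     // Only byte lanes 0 and 15 have their sign bit set.
///     let a = _mm_set_epi8(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
///     // Bit i of the result is the most significant bit of byte lane i.
///     assert_eq!(_mm_movemask_epi8(a), 0b1000_0000_0000_0001);
/// }
/// ```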
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmovmskb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
    unsafe {
        let z = i8x16::ZERO;
        let m: i8x16 = simd_lt(a.as_i8x16(), z);
        simd_bitmask::<_, u16>(m) as u32 as i32
    }
}

/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
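///
/// # Examples
///
/// Each two-bit field of `IMM8` (starting from the least significant bits)
/// selects the source lane for the corresponding output lane. A minimal
/// sketch (assumes an x86_64 target, where SSE2 is part of the baseline
/// feature set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let a = _mm_set_epi32(3, 2, 1, 0); // lane i holds i
///     // 0b00_01_10_11 selects lanes [3, 2, 1, 0], i.e. reverses the vector.
///     let r = _mm_shuffle_epi32::<0b00_01_10_11>(a);
///     assert_eq!(_mm_cvtsi128_si32(r), 3); // new low lane came from lane 3
/// }
/// ```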
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x4();
        let x: i32x4 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        );
        transmute(x)
    }
}

/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
/// `IMM8`.
///
/// Puts the results in the high 64 bits of the returned vector, with the low
/// 64 bits being copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                0,
                1,
                2,
                3,
                (IMM8 as u32 & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        );
        transmute(x)
    }
}

/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
/// `IMM8`.
///
/// Puts the results in the low 64 bits of the returned vector, with the high
/// 64 bits being copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x8();
        let x: i16x8 = simd_shuffle!(
            a,
            a,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                4,
                5,
                6,
                7,
            ],
        );
        transmute(x)
    }
}

/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
        ))
    }
}

/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
}

/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
}

/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
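///
/// # Examples
///
/// Interleaving with a zero vector is the classic way to widen unsigned
/// bytes to 16-bit lanes; a minimal sketch (assumes an x86_64 target, where
/// SSE2 is part of the baseline feature set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let bytes = _mm_set1_epi8(-56); // bit pattern 0xC8, i.e. 200 as u8
///     // Result lanes are [a0, b0, a1, b1, ...]; pairing each byte with a
///     // zero high byte zero-extends it to 16 bits.
///     let wide = _mm_unpacklo_epi8(bytes, _mm_setzero_si128());
///     assert_eq!(_mm_extract_epi16::<0>(wide), 200);
/// }
/// ```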
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        transmute::<i8x16, _>(simd_shuffle!(
            a.as_i8x16(),
            b.as_i8x16(),
            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
        ))
    }
}

/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
        transmute::<i16x8, _>(x)
    }
}

/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
}

/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
}

/// Returns a new vector with the low element of `a` replaced by the sum of the
/// low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
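///
/// # Examples
///
/// A minimal sketch of the scalar semantics: only the low lane is computed,
/// and the upper lane passes through from `a` (assumes an x86_64 target,
/// where SSE2 is part of the baseline feature set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let a = _mm_set_pd(10.0, 1.0); // [1.0 (low), 10.0 (high)]
///     let b = _mm_set_pd(20.0, 2.0);
///     let r = _mm_add_sd(a, b);
///     assert_eq!(_mm_cvtsd_f64(r), 3.0);                      // 1.0 + 2.0
///     assert_eq!(_mm_cvtsd_f64(_mm_unpackhi_pd(r, r)), 10.0); // from `a`
/// }
/// ```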
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_add(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// dividing the lower element of `a` by the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
}

/// Divides packed double-precision (64-bit) floating-point elements in `a` by
/// packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_div(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the maximum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { maxsd(a, b) }
}

/// Returns a new vector with the maximum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { maxpd(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the minimum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { minsd(a, b) }
}

/// Returns a new vector with the minimum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { minpd(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the product of
/// the low elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
/// and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_mul(a, b) }
}

/// Returns a new vector with the low element of `a` replaced by the square
/// root of the lower element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
}

/// Returns a new vector with the square root of each of the values in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
    unsafe { simd_fsqrt(a) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// subtracting the low element of `b` from the low element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_sub(a, b) }
}

/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_and_si128(a, b))
    }
}

/// Computes the bitwise NOT of `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
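///
/// # Examples
///
/// Because this is a plain bitwise operation on the underlying bits, it can
/// clear the sign bit of each lane to compute an absolute value; a minimal
/// sketch (assumes an x86_64 target, where SSE2 is part of the baseline
/// feature set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let sign_bits = _mm_set1_pd(-0.0); // only the sign bit set per lane
///     let x = _mm_set_pd(-2.5, 3.5);
///     // (!sign_bits) & x clears the sign bit in both lanes.
///     let abs = _mm_andnot_pd(sign_bits, x);
///     assert_eq!(_mm_cvtsd_f64(abs), 3.5);
///     assert_eq!(_mm_cvtsd_f64(_mm_unpackhi_pd(abs, abs)), 2.5);
/// }
/// ```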
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_andnot_si128(a, b))
    }
}

/// Computes the bitwise OR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_or_si128(a, b))
    }
}

/// Computes the bitwise XOR of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        let a: __m128i = transmute(a);
        let b: __m128i = transmute(b);
        transmute(_mm_xor_si128(a, b))
    }
}

/// Returns a new vector with the low element of `a` replaced by the equality
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 0) }
}

/// Returns a new vector with the low element of `a` replaced by the less-than
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 1) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 2) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the result
/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
/// neither is `NaN` then `0xFFFFFFFFFFFFFFFF` is used, and `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 7) }
}

/// Returns a new vector with the low element of `a` replaced by the result of
/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
/// `NaN` then `0xFFFFFFFFFFFFFFFF` is used, and `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 3) }
}

/// Returns a new vector with the low element of `a` replaced by the not-equal
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 4) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 5) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmpsd(a, b, 6) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Returns a new vector with the low element of `a` replaced by the
/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
}

/// Compares corresponding elements in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 0) }
}

/// Compares corresponding elements in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
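///
/// # Examples
///
/// Each result lane is all-ones where the comparison holds and all-zeros
/// where it does not, which pairs naturally with [`_mm_movemask_pd`]; a
/// minimal sketch (assumes an x86_64 target, where SSE2 is part of the
/// baseline feature set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let a = _mm_set_pd(4.0, 1.0); // [1.0 (low), 4.0 (high)]
///     let b = _mm_set_pd(3.0, 2.0);
///     let m = _mm_cmplt_pd(a, b);
///     // Low lane: 1.0 < 2.0 holds; high lane: 4.0 < 3.0 does not.
///     assert_eq!(_mm_movemask_pd(m), 0b01);
/// }
/// ```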
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 1) }
}

/// Compares corresponding elements in `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 2) }
}

/// Compares corresponding elements in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmplt_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmple_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 7) }
}

/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 3) }
}

/// Compares corresponding elements in `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 4) }
}

/// Compares corresponding elements in `a` and `b` for not-less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 5) }
}

/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { cmppd(a, b, 6) }
}

/// Compares corresponding elements in `a` and `b` for not-greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmpnlt_pd(b, a)
}

/// Compares corresponding elements in `a` and `b` for
/// not-greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmpnle_pd(b, a)
}

/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { comineqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomieqsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomiltsd(a, b) }
}

/// Compares the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomilesd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomigtsd(a, b) }
}

/// Compares the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomigesd(a, b) }
}

/// Compares the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
    unsafe { ucomineqsd(a, b) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
    unsafe {
        let r = simd_cast::<_, f32x2>(a.as_f64x2());
        let zero = f32x2::ZERO;
        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
    }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_pd(a: __m128) -> __m128d {
    unsafe {
        let a = a.as_f32x4();
        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
    }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
    unsafe { transmute(cvtpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
    unsafe { cvtsd2si(a) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, stores the result in
/// the lower element of the return value, and copies the upper 3 packed
/// elements from `a` to the upper elements of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
    unsafe { cvtsd2ss(a, b) }
}

/// Returns the lower double-precision (64-bit) floating-point element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
    unsafe { simd_extract!(a, 0) }
}

/// Converts the lower single-precision (32-bit) floating-point element in `b`
/// to a double-precision (64-bit) floating-point element, stores the result in
/// the lower element of the return value, and copies the upper element from
/// `a` to the upper element of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtss2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
    unsafe {
        let elt: f32 = simd_extract!(b, 0);
        simd_insert!(a, 0, elt as f64)
    }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
    unsafe { transmute(cvttpd2dq(a)) }
}

/// Converts the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
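///
/// # Examples
///
/// A minimal sketch contrasting truncation with [`_mm_cvtsd_si32`], assuming
/// the default MXCSR rounding mode (round to nearest) and an x86_64 target,
/// where SSE2 is part of the baseline feature set:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let x = _mm_set_sd(1.7);
///     assert_eq!(_mm_cvtsd_si32(x), 2);  // rounds to nearest
///     assert_eq!(_mm_cvttsd_si32(x), 1); // truncates toward zero
/// }
/// ```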
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
    unsafe { cvttsd2si(a) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvttps2dq(a)) }
}

/// Copies double-precision (64-bit) floating-point element `a` to the lower
/// element of the return value, and zeroes the upper element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_sd(a: f64) -> __m128d {
    _mm_set_pd(0.0, a)
}

/// Broadcasts double-precision (64-bit) floating-point value `a` to all
/// elements of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_pd(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Broadcasts double-precision (64-bit) floating-point value `a` to all
/// elements of the return value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_pd1(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
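///
/// # Examples
///
/// The arguments are given from the highest element to the lowest, so the
/// *last* argument lands in the low lane; a minimal sketch (assumes an
/// x86_64 target, where SSE2 is part of the baseline feature set):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let v = _mm_set_pd(2.0, 1.0);
///     assert_eq!(_mm_cvtsd_f64(v), 1.0); // low lane holds the last argument
///     // `_mm_setr_pd` takes the same values in memory (low-to-high) order.
///     let r = _mm_setr_pd(1.0, 2.0);
///     assert_eq!(_mm_cvtsd_f64(r), 1.0);
/// }
/// ```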
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_pd(a: f64, b: f64) -> __m128d {
    __m128d([b, a])
}

/// Sets packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
    _mm_set_pd(b, a)
}

/// Returns packed double-precision (64-bit) floating-point elements with all
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_pd() -> __m128d {
    const { unsafe { mem::zeroed() } }
}

/// Returns a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 2 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movemask_pd(a: __m128d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
        simd_bitmask::<i64x2, u8>(mask).into()
    }
}

/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
    *(mem_addr as *const __m128d)
}

/// Loads a 64-bit double-precision value into the low element of a 128-bit
/// floating-point vector and clears the upper element.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
    _mm_setr_pd(*mem_addr, 0.)
}

/// Loads a double-precision value into the high-order bits of a 128-bit
/// vector of `[2 x double]`. The low-order bits are copied from the low-order
/// bits of the first operand.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
}

/// Loads a double-precision value into the low-order bits of a 128-bit
/// vector of `[2 x double]`. The high-order bits are copied from the
/// high-order bits of the first operand.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
}

/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
/// aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
    // See #1541: use inline asm to guarantee a non-temporal store, since
    // LLVM's LangRef isn't clear enough about `!nontemporal` semantics.
    crate::arch::asm!(
        vps!("movntpd", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
2647
2648/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2649/// memory location.
2650///
2651/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
2652#[inline]
2653#[target_feature(enable = "sse2")]
2654#[cfg_attr(test, assert_instr(movlps))]
2655#[stable(feature = "simd_x86", since = "1.27.0")]
2656pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
2657    *mem_addr = simd_extract!(a, 0)
2658}
2659
2660/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2661/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
2662/// on a 16-byte boundary or a general-protection exception may be generated.
2663///
2664/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
2665#[inline]
2666#[target_feature(enable = "sse2")]
2667#[cfg_attr(
2668    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2669    assert_instr(movaps)
2670)]
2671#[stable(feature = "simd_x86", since = "1.27.0")]
2672#[allow(clippy::cast_ptr_alignment)]
2673pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
2674    *(mem_addr as *mut __m128d) = a;
2675}
2676
2677/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2678/// floating-point elements) from `a` into memory.
2679/// `mem_addr` does not need to be aligned on any particular boundary.
2680///
2681/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
2682#[inline]
2683#[target_feature(enable = "sse2")]
2684#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
2685#[stable(feature = "simd_x86", since = "1.27.0")]
2686pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
2687    mem_addr.cast::<__m128d>().write_unaligned(a);
2688}
2689
2690/// Store 16-bit integer from the first element of a into memory.
2691///
2692/// `mem_addr` does not need to be aligned on any particular boundary.
2693///
2694/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2695#[inline]
2696#[target_feature(enable = "sse2")]
2697#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2698pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2699    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2700}
2701
2702/// Store 32-bit integer from the first element of a into memory.
2703///
2704/// `mem_addr` does not need to be aligned on any particular boundary.
2705///
2706/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
2707#[inline]
2708#[target_feature(enable = "sse2")]
2709#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2710pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2711    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2712}
2713
2714/// Store 64-bit integer from the first element of a into memory.
2715///
2716/// `mem_addr` does not need to be aligned on any particular boundary.
2717///
2718/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2719#[inline]
2720#[target_feature(enable = "sse2")]
2721#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2722pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2723    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2724}
2725
2726/// Stores the lower double-precision (64-bit) floating-point element from `a`
2727/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2728/// 16-byte boundary or a general-protection exception may be generated.
2729///
2730/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
2731#[inline]
2732#[target_feature(enable = "sse2")]
2733#[stable(feature = "simd_x86", since = "1.27.0")]
2734#[allow(clippy::cast_ptr_alignment)]
2735pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2736    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2737    *(mem_addr as *mut __m128d) = b;
2738}
2739
2740/// Stores the lower double-precision (64-bit) floating-point element from `a`
2741/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2742/// 16-byte boundary or a general-protection exception may be generated.
2743///
2744/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2745#[inline]
2746#[target_feature(enable = "sse2")]
2747#[stable(feature = "simd_x86", since = "1.27.0")]
2748#[allow(clippy::cast_ptr_alignment)]
2749pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2750    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2751    *(mem_addr as *mut __m128d) = b;
2752}
2753
2754/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2755/// memory in reverse order.
2756/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2757/// exception may be generated.
2758///
2759/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
2760#[inline]
2761#[target_feature(enable = "sse2")]
2762#[stable(feature = "simd_x86", since = "1.27.0")]
2763#[allow(clippy::cast_ptr_alignment)]
2764pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2765    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2766    *(mem_addr as *mut __m128d) = b;
2767}
2768
2769/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
2770/// memory location.
2771///
2772/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
2773#[inline]
2774#[target_feature(enable = "sse2")]
2775#[cfg_attr(test, assert_instr(movhps))]
2776#[stable(feature = "simd_x86", since = "1.27.0")]
2777pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
2778    *mem_addr = simd_extract!(a, 1);
2779}
2780
2781/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2782/// memory location.
2783///
2784/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
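///
/// # Examples
///
/// A minimal sketch using this together with `_mm_storeh_pd` above to split
/// a vector into its two scalar halves:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     // SAFETY: SSE2 is part of the x86_64 baseline.
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let (mut lo, mut hi) = (0.0f64, 0.0f64);
///         _mm_storel_pd(&mut lo, a);
///         _mm_storeh_pd(&mut hi, a);
///         assert_eq!((lo, hi), (1.0, 2.0));
///     }
/// }
/// ```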
2785#[inline]
2786#[target_feature(enable = "sse2")]
2787#[cfg_attr(test, assert_instr(movlps))]
2788#[stable(feature = "simd_x86", since = "1.27.0")]
2789pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
2790    *mem_addr = simd_extract!(a, 0);
2791}
2792
2793/// Loads a double-precision (64-bit) floating-point element from memory
2794/// into both elements of the returned vector.
2795///
2796/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
2797#[inline]
2798#[target_feature(enable = "sse2")]
2799// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2800#[stable(feature = "simd_x86", since = "1.27.0")]
2801pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2802    let d = *mem_addr;
2803    _mm_setr_pd(d, d)
2804}
2805
2806/// Loads a double-precision (64-bit) floating-point element from memory
2807/// into both elements of the returned vector.
2808///
2809/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2810#[inline]
2811#[target_feature(enable = "sse2")]
2812// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2813#[stable(feature = "simd_x86", since = "1.27.0")]
2814pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2815    _mm_load1_pd(mem_addr)
2816}
2817
2818/// Loads 2 double-precision (64-bit) floating-point elements from memory into
2819/// the returned vector in reverse order. `mem_addr` must be aligned on a
2820/// 16-byte boundary or a general-protection exception may be generated.
2821///
2822/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
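///
/// # Examples
///
/// A minimal sketch; the elements come back swapped relative to their memory
/// order (`Aligned` is a local helper providing the required 16-byte
/// alignment):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     // SAFETY: SSE2 is part of the x86_64 baseline.
///     unsafe {
///         #[repr(align(16))]
///         struct Aligned([f64; 2]);
///         let src = Aligned([1.0, 2.0]);
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), _mm_loadr_pd(src.0.as_ptr()));
///         assert_eq!(out, [2.0, 1.0]);
///     }
/// }
/// ```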
2823#[inline]
2824#[target_feature(enable = "sse2")]
2825#[cfg_attr(
2826    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2827    assert_instr(movaps)
2828)]
2829#[stable(feature = "simd_x86", since = "1.27.0")]
2830pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
2831    let a = _mm_load_pd(mem_addr);
2832    simd_shuffle!(a, a, [1, 0])
2833}
2834
2835/// Loads 128 bits (composed of 2 packed double-precision (64-bit)
2836/// floating-point elements) from memory into the returned vector.
2837/// `mem_addr` does not need to be aligned on any particular boundary.
2838///
2839/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
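///
/// # Examples
///
/// A minimal sketch loading from an arbitrary offset inside a larger buffer,
/// which an aligned load could not do safely:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     // SAFETY: SSE2 is part of the x86_64 baseline.
///     unsafe {
///         let data = [1.0f64, 2.0, 3.0];
///         // Starting at index 1 carries no 16-byte alignment guarantee.
///         let v = _mm_loadu_pd(data.as_ptr().add(1));
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), v);
///         assert_eq!(out, [2.0, 3.0]);
///     }
/// }
/// ```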
2840#[inline]
2841#[target_feature(enable = "sse2")]
2842#[cfg_attr(test, assert_instr(movups))]
2843#[stable(feature = "simd_x86", since = "1.27.0")]
2844pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
2845    let mut dst = _mm_undefined_pd();
2846    ptr::copy_nonoverlapping(
2847        mem_addr as *const u8,
2848        ptr::addr_of_mut!(dst) as *mut u8,
2849        mem::size_of::<__m128d>(),
2850    );
2851    dst
2852}
2853
2854/// Loads unaligned 16 bits of integer data from memory into a new vector.
2855///
2856/// `mem_addr` does not need to be aligned on any particular boundary.
2857///
2858/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
2859#[inline]
2860#[target_feature(enable = "sse2")]
2861#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2862pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
2863    transmute(i16x8::new(
2864        ptr::read_unaligned(mem_addr as *const i16),
2865        0,
2866        0,
2867        0,
2868        0,
2869        0,
2870        0,
2871        0,
2872    ))
2873}
2874
2875/// Loads unaligned 32 bits of integer data from memory into a new vector.
2876///
2877/// `mem_addr` does not need to be aligned on any particular boundary.
2878///
2879/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
2880#[inline]
2881#[target_feature(enable = "sse2")]
2882#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2883pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
2884    transmute(i32x4::new(
2885        ptr::read_unaligned(mem_addr as *const i32),
2886        0,
2887        0,
2888        0,
2889    ))
2890}
2891
2892/// Loads unaligned 64 bits of integer data from memory into a new vector.
2893///
2894/// `mem_addr` does not need to be aligned on any particular boundary.
2895///
2896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
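///
/// # Examples
///
/// A minimal sketch; the 64 loaded bits land in the low lane and the high
/// lane is zeroed (`_mm_loadu_si16`/`_mm_loadu_si32` above behave
/// analogously):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     // SAFETY: SSE2 is part of the x86_64 baseline.
///     unsafe {
///         let bytes = 0x0123_4567_89AB_CDEF_u64.to_ne_bytes();
///         let v = _mm_loadu_si64(bytes.as_ptr());
///         let mut out = [0u64; 2];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, v);
///         assert_eq!(out, [0x0123_4567_89AB_CDEF, 0]);
///     }
/// }
/// ```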
2897#[inline]
2898#[target_feature(enable = "sse2")]
2899#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
2900pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
2901    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
2902}
2903
2904/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
2905/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
2906/// parameter as a specifier.
2907///
2908/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
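///
/// # Examples
///
/// A minimal sketch of the mask encoding: bit 0 of `MASK` selects the lower
/// result lane from `a`, bit 1 selects the upper result lane from `b`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     // SAFETY: SSE2 is part of the x86_64 baseline.
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let b = _mm_setr_pd(10.0, 20.0);
///         // MASK = 0b01: lower lane = a[1], upper lane = b[0].
///         let r = _mm_shuffle_pd::<0b01>(a, b);
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [2.0, 10.0]);
///     }
/// }
/// ```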
2909#[inline]
2910#[target_feature(enable = "sse2")]
2911#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
2912#[rustc_legacy_const_generics(2)]
2913#[stable(feature = "simd_x86", since = "1.27.0")]
2914pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
2915    static_assert_uimm_bits!(MASK, 8);
2916    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
2917}
2918
2919/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
2920/// 64 bits are set to the lower 64 bits of the second parameter. The upper
2921/// 64 bits are set to the upper 64 bits of the first parameter.
2922///
2923/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
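///
/// # Examples
///
/// A minimal sketch; the result takes its low lane from `b` and its high
/// lane from `a`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     // SAFETY: SSE2 is part of the x86_64 baseline.
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let b = _mm_setr_pd(10.0, 20.0);
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), _mm_move_sd(a, b));
///         assert_eq!(out, [10.0, 2.0]);
///     }
/// }
/// ```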
2924#[inline]
2925#[target_feature(enable = "sse2")]
2926#[cfg_attr(test, assert_instr(movsd))]
2927#[stable(feature = "simd_x86", since = "1.27.0")]
2928pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
2929    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
2930}
2931
2932/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2933/// floating-point vector of `[4 x float]`.
2934///
2935/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
2936#[inline]
2937#[target_feature(enable = "sse2")]
2938#[stable(feature = "simd_x86", since = "1.27.0")]
2939pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
2940    unsafe { transmute(a) }
2941}
2942
2943/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2944/// integer vector.
2945///
2946/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
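///
/// # Examples
///
/// A minimal sketch; this and the neighboring cast intrinsics are free
/// bitwise reinterpretations, so `-0.0` shows up as just its sign bit:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     // SAFETY: SSE2 is part of the x86_64 baseline.
///     unsafe {
///         let bits = _mm_castpd_si128(_mm_set1_pd(-0.0));
///         let mut out = [0u64; 2];
///         _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, bits);
///         assert_eq!(out, [0x8000_0000_0000_0000; 2]);
///     }
/// }
/// ```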
2947#[inline]
2948#[target_feature(enable = "sse2")]
2949#[stable(feature = "simd_x86", since = "1.27.0")]
2950pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
2951    unsafe { transmute(a) }
2952}
2953
2954/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2955/// floating-point vector of `[2 x double]`.
2956///
2957/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
2958#[inline]
2959#[target_feature(enable = "sse2")]
2960#[stable(feature = "simd_x86", since = "1.27.0")]
2961pub fn _mm_castps_pd(a: __m128) -> __m128d {
2962    unsafe { transmute(a) }
2963}
2964
2965/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2966/// integer vector.
2967///
2968/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
2969#[inline]
2970#[target_feature(enable = "sse2")]
2971#[stable(feature = "simd_x86", since = "1.27.0")]
2972pub fn _mm_castps_si128(a: __m128) -> __m128i {
2973    unsafe { transmute(a) }
2974}
2975
2976/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2977/// of `[2 x double]`.
2978///
2979/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
2980#[inline]
2981#[target_feature(enable = "sse2")]
2982#[stable(feature = "simd_x86", since = "1.27.0")]
2983pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
2984    unsafe { transmute(a) }
2985}
2986
2987/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2988/// of `[4 x float]`.
2989///
2990/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
2991#[inline]
2992#[target_feature(enable = "sse2")]
2993#[stable(feature = "simd_x86", since = "1.27.0")]
2994pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
2995    unsafe { transmute(a) }
2996}
2997
2998/// Returns a vector of type `__m128d` with indeterminate elements.
2999/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3000/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3001/// In practice, this is typically equivalent to [`mem::zeroed`].
3002///
3003/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
3004#[inline]
3005#[target_feature(enable = "sse2")]
3006#[stable(feature = "simd_x86", since = "1.27.0")]
3007pub fn _mm_undefined_pd() -> __m128d {
3008    const { unsafe { mem::zeroed() } }
3009}
3010
3011/// Returns a vector of type `__m128i` with indeterminate elements.
3012/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
3013/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
3014/// In practice, this is typically equivalent to [`mem::zeroed`].
3015///
3016/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
3017#[inline]
3018#[target_feature(enable = "sse2")]
3019#[stable(feature = "simd_x86", since = "1.27.0")]
3020pub fn _mm_undefined_si128() -> __m128i {
3021    const { unsafe { mem::zeroed() } }
3022}
3023
3024/// The resulting `__m128d` element is composed of the high-order values of
3025/// the two `__m128d` interleaved input elements, i.e.:
3026///
3027/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
3028/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
3029///
3030/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
3031#[inline]
3032#[target_feature(enable = "sse2")]
3033#[cfg_attr(test, assert_instr(unpckhpd))]
3034#[stable(feature = "simd_x86", since = "1.27.0")]
3035pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
3036    unsafe { simd_shuffle!(a, b, [1, 3]) }
3037}
3038
3039/// The resulting `__m128d` element is composed of the low-order values of
3040/// the two `__m128d` interleaved input elements, i.e.:
3041///
3042/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
3043/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
3044///
3045/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
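///
/// # Examples
///
/// A minimal sketch of this intrinsic next to `_mm_unpackhi_pd` above:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     // SAFETY: SSE2 is part of the x86_64 baseline.
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let b = _mm_setr_pd(3.0, 4.0);
///         let (mut lo, mut hi) = ([0.0f64; 2], [0.0f64; 2]);
///         _mm_storeu_pd(lo.as_mut_ptr(), _mm_unpacklo_pd(a, b));
///         _mm_storeu_pd(hi.as_mut_ptr(), _mm_unpackhi_pd(a, b));
///         assert_eq!(lo, [1.0, 3.0]);
///         assert_eq!(hi, [2.0, 4.0]);
///     }
/// }
/// ```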
3046#[inline]
3047#[target_feature(enable = "sse2")]
3048#[cfg_attr(test, assert_instr(movlhps))]
3049#[stable(feature = "simd_x86", since = "1.27.0")]
3050pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
3051    unsafe { simd_shuffle!(a, b, [0, 2]) }
3052}
3053
3054#[allow(improper_ctypes)]
3055unsafe extern "C" {
3056    #[link_name = "llvm.x86.sse2.pause"]
3057    fn pause();
3058    #[link_name = "llvm.x86.sse2.clflush"]
3059    fn clflush(p: *const u8);
3060    #[link_name = "llvm.x86.sse2.lfence"]
3061    fn lfence();
3062    #[link_name = "llvm.x86.sse2.mfence"]
3063    fn mfence();
3064    #[link_name = "llvm.x86.sse2.pmadd.wd"]
3065    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
3066    #[link_name = "llvm.x86.sse2.psad.bw"]
3067    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
3068    #[link_name = "llvm.x86.sse2.psll.w"]
3069    fn psllw(a: i16x8, count: i16x8) -> i16x8;
3070    #[link_name = "llvm.x86.sse2.psll.d"]
3071    fn pslld(a: i32x4, count: i32x4) -> i32x4;
3072    #[link_name = "llvm.x86.sse2.psll.q"]
3073    fn psllq(a: i64x2, count: i64x2) -> i64x2;
3074    #[link_name = "llvm.x86.sse2.psra.w"]
3075    fn psraw(a: i16x8, count: i16x8) -> i16x8;
3076    #[link_name = "llvm.x86.sse2.psra.d"]
3077    fn psrad(a: i32x4, count: i32x4) -> i32x4;
3078    #[link_name = "llvm.x86.sse2.psrl.w"]
3079    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
3080    #[link_name = "llvm.x86.sse2.psrl.d"]
3081    fn psrld(a: i32x4, count: i32x4) -> i32x4;
3082    #[link_name = "llvm.x86.sse2.psrl.q"]
3083    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
3084    #[link_name = "llvm.x86.sse2.cvtps2dq"]
3085    fn cvtps2dq(a: __m128) -> i32x4;
3086    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
3087    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
3088    #[link_name = "llvm.x86.sse2.packsswb.128"]
3089    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
3090    #[link_name = "llvm.x86.sse2.packssdw.128"]
3091    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
3092    #[link_name = "llvm.x86.sse2.packuswb.128"]
3093    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
3094    #[link_name = "llvm.x86.sse2.max.sd"]
3095    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
3096    #[link_name = "llvm.x86.sse2.max.pd"]
3097    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
3098    #[link_name = "llvm.x86.sse2.min.sd"]
3099    fn minsd(a: __m128d, b: __m128d) -> __m128d;
3100    #[link_name = "llvm.x86.sse2.min.pd"]
3101    fn minpd(a: __m128d, b: __m128d) -> __m128d;
3102    #[link_name = "llvm.x86.sse2.cmp.sd"]
3103    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3104    #[link_name = "llvm.x86.sse2.cmp.pd"]
3105    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3106    #[link_name = "llvm.x86.sse2.comieq.sd"]
3107    fn comieqsd(a: __m128d, b: __m128d) -> i32;
3108    #[link_name = "llvm.x86.sse2.comilt.sd"]
3109    fn comiltsd(a: __m128d, b: __m128d) -> i32;
3110    #[link_name = "llvm.x86.sse2.comile.sd"]
3111    fn comilesd(a: __m128d, b: __m128d) -> i32;
3112    #[link_name = "llvm.x86.sse2.comigt.sd"]
3113    fn comigtsd(a: __m128d, b: __m128d) -> i32;
3114    #[link_name = "llvm.x86.sse2.comige.sd"]
3115    fn comigesd(a: __m128d, b: __m128d) -> i32;
3116    #[link_name = "llvm.x86.sse2.comineq.sd"]
3117    fn comineqsd(a: __m128d, b: __m128d) -> i32;
3118    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
3119    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
3120    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
3121    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
3122    #[link_name = "llvm.x86.sse2.ucomile.sd"]
3123    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
3124    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
3125    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
3126    #[link_name = "llvm.x86.sse2.ucomige.sd"]
3127    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
3128    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
3129    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
3130    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
3131    fn cvtpd2dq(a: __m128d) -> i32x4;
3132    #[link_name = "llvm.x86.sse2.cvtsd2si"]
3133    fn cvtsd2si(a: __m128d) -> i32;
3134    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
3135    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
3136    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
3137    fn cvttpd2dq(a: __m128d) -> i32x4;
3138    #[link_name = "llvm.x86.sse2.cvttsd2si"]
3139    fn cvttsd2si(a: __m128d) -> i32;
3140    #[link_name = "llvm.x86.sse2.cvttps2dq"]
3141    fn cvttps2dq(a: __m128) -> i32x4;
3142}
3143
3144#[cfg(test)]
3145mod tests {
3146    use crate::{
3147        core_arch::{simd::*, x86::*},
3148        hint::black_box,
3149    };
3150    use std::{
3151        boxed, f32, f64,
3152        mem::{self, transmute},
3153        ptr,
3154    };
3155    use stdarch_test::simd_test;
3156
3157    const NAN: f64 = f64::NAN;
3158
3159    #[test]
3160    fn test_mm_pause() {
3161        _mm_pause()
3162    }
3163
3164    #[simd_test(enable = "sse2")]
3165    unsafe fn test_mm_clflush() {
3166        let x = 0_u8;
3167        _mm_clflush(ptr::addr_of!(x));
3168    }
3169
3170    #[simd_test(enable = "sse2")]
3171    // Miri cannot support this until it is clear how it fits in the Rust memory model
3172    #[cfg_attr(miri, ignore)]
3173    unsafe fn test_mm_lfence() {
3174        _mm_lfence();
3175    }
3176
3177    #[simd_test(enable = "sse2")]
3178    // Miri cannot support this until it is clear how it fits in the Rust memory model
3179    #[cfg_attr(miri, ignore)]
3180    unsafe fn test_mm_mfence() {
3181        _mm_mfence();
3182    }
3183
3184    #[simd_test(enable = "sse2")]
3185    unsafe fn test_mm_add_epi8() {
3186        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3187        #[rustfmt::skip]
3188        let b = _mm_setr_epi8(
3189            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3190        );
3191        let r = _mm_add_epi8(a, b);
3192        #[rustfmt::skip]
3193        let e = _mm_setr_epi8(
3194            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3195        );
3196        assert_eq_m128i(r, e);
3197    }
3198
3199    #[simd_test(enable = "sse2")]
3200    unsafe fn test_mm_add_epi8_overflow() {
3201        let a = _mm_set1_epi8(0x7F);
3202        let b = _mm_set1_epi8(1);
3203        let r = _mm_add_epi8(a, b);
3204        assert_eq_m128i(r, _mm_set1_epi8(-128));
3205    }
3206
3207    #[simd_test(enable = "sse2")]
3208    unsafe fn test_mm_add_epi16() {
3209        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3210        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3211        let r = _mm_add_epi16(a, b);
3212        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3213        assert_eq_m128i(r, e);
3214    }
3215
3216    #[simd_test(enable = "sse2")]
3217    unsafe fn test_mm_add_epi32() {
3218        let a = _mm_setr_epi32(0, 1, 2, 3);
3219        let b = _mm_setr_epi32(4, 5, 6, 7);
3220        let r = _mm_add_epi32(a, b);
3221        let e = _mm_setr_epi32(4, 6, 8, 10);
3222        assert_eq_m128i(r, e);
3223    }
3224
3225    #[simd_test(enable = "sse2")]
3226    unsafe fn test_mm_add_epi64() {
3227        let a = _mm_setr_epi64x(0, 1);
3228        let b = _mm_setr_epi64x(2, 3);
3229        let r = _mm_add_epi64(a, b);
3230        let e = _mm_setr_epi64x(2, 4);
3231        assert_eq_m128i(r, e);
3232    }
3233
3234    #[simd_test(enable = "sse2")]
3235    unsafe fn test_mm_adds_epi8() {
3236        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3237        #[rustfmt::skip]
3238        let b = _mm_setr_epi8(
3239            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3240        );
3241        let r = _mm_adds_epi8(a, b);
3242        #[rustfmt::skip]
3243        let e = _mm_setr_epi8(
3244            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3245        );
3246        assert_eq_m128i(r, e);
3247    }
3248
3249    #[simd_test(enable = "sse2")]
3250    unsafe fn test_mm_adds_epi8_saturate_positive() {
3251        let a = _mm_set1_epi8(0x7F);
3252        let b = _mm_set1_epi8(1);
3253        let r = _mm_adds_epi8(a, b);
3254        assert_eq_m128i(r, a);
3255    }
3256
3257    #[simd_test(enable = "sse2")]
3258    unsafe fn test_mm_adds_epi8_saturate_negative() {
3259        let a = _mm_set1_epi8(-0x80);
3260        let b = _mm_set1_epi8(-1);
3261        let r = _mm_adds_epi8(a, b);
3262        assert_eq_m128i(r, a);
3263    }
3264
3265    #[simd_test(enable = "sse2")]
3266    unsafe fn test_mm_adds_epi16() {
3267        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3268        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3269        let r = _mm_adds_epi16(a, b);
3270        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3271        assert_eq_m128i(r, e);
3272    }
3273
3274    #[simd_test(enable = "sse2")]
3275    unsafe fn test_mm_adds_epi16_saturate_positive() {
3276        let a = _mm_set1_epi16(0x7FFF);
3277        let b = _mm_set1_epi16(1);
3278        let r = _mm_adds_epi16(a, b);
3279        assert_eq_m128i(r, a);
3280    }
3281
3282    #[simd_test(enable = "sse2")]
3283    unsafe fn test_mm_adds_epi16_saturate_negative() {
3284        let a = _mm_set1_epi16(-0x8000);
3285        let b = _mm_set1_epi16(-1);
3286        let r = _mm_adds_epi16(a, b);
3287        assert_eq_m128i(r, a);
3288    }
3289
3290    #[simd_test(enable = "sse2")]
3291    unsafe fn test_mm_adds_epu8() {
3292        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3293        #[rustfmt::skip]
3294        let b = _mm_setr_epi8(
3295            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3296        );
3297        let r = _mm_adds_epu8(a, b);
3298        #[rustfmt::skip]
3299        let e = _mm_setr_epi8(
3300            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3301        );
3302        assert_eq_m128i(r, e);
3303    }
3304
3305    #[simd_test(enable = "sse2")]
3306    unsafe fn test_mm_adds_epu8_saturate() {
3307        let a = _mm_set1_epi8(!0);
3308        let b = _mm_set1_epi8(1);
3309        let r = _mm_adds_epu8(a, b);
3310        assert_eq_m128i(r, a);
3311    }
3312
3313    #[simd_test(enable = "sse2")]
3314    unsafe fn test_mm_adds_epu16() {
3315        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3316        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3317        let r = _mm_adds_epu16(a, b);
3318        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3319        assert_eq_m128i(r, e);
3320    }
3321
3322    #[simd_test(enable = "sse2")]
3323    unsafe fn test_mm_adds_epu16_saturate() {
3324        let a = _mm_set1_epi16(!0);
3325        let b = _mm_set1_epi16(1);
3326        let r = _mm_adds_epu16(a, b);
3327        assert_eq_m128i(r, a);
3328    }
3329
3330    #[simd_test(enable = "sse2")]
3331    unsafe fn test_mm_avg_epu8() {
3332        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3333        let r = _mm_avg_epu8(a, b);
3334        assert_eq_m128i(r, _mm_set1_epi8(6));
3335    }
3336
3337    #[simd_test(enable = "sse2")]
3338    unsafe fn test_mm_avg_epu16() {
3339        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3340        let r = _mm_avg_epu16(a, b);
3341        assert_eq_m128i(r, _mm_set1_epi16(6));
3342    }
3343
3344    #[simd_test(enable = "sse2")]
3345    unsafe fn test_mm_madd_epi16() {
3346        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3347        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3348        let r = _mm_madd_epi16(a, b);
3349        let e = _mm_setr_epi32(29, 81, 149, 233);
3350        assert_eq_m128i(r, e);
3351
3352        // Test large values.
3353        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3354        let a = _mm_setr_epi16(
3355            i16::MAX,
3356            i16::MAX,
3357            i16::MIN,
3358            i16::MIN,
3359            i16::MIN,
3360            i16::MAX,
3361            0,
3362            0,
3363        );
3364        let b = _mm_setr_epi16(
3365            i16::MAX,
3366            i16::MAX,
3367            i16::MIN,
3368            i16::MIN,
3369            i16::MAX,
3370            i16::MIN,
3371            0,
3372            0,
3373        );
3374        let r = _mm_madd_epi16(a, b);
3375        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3376        assert_eq_m128i(r, e);
3377    }
3378
3379    #[simd_test(enable = "sse2")]
3380    unsafe fn test_mm_max_epi16() {
3381        let a = _mm_set1_epi16(1);
3382        let b = _mm_set1_epi16(-1);
3383        let r = _mm_max_epi16(a, b);
3384        assert_eq_m128i(r, a);
3385    }
3386
3387    #[simd_test(enable = "sse2")]
3388    unsafe fn test_mm_max_epu8() {
3389        let a = _mm_set1_epi8(1);
3390        let b = _mm_set1_epi8(!0);
3391        let r = _mm_max_epu8(a, b);
3392        assert_eq_m128i(r, b);
3393    }
3394
3395    #[simd_test(enable = "sse2")]
3396    unsafe fn test_mm_min_epi16() {
3397        let a = _mm_set1_epi16(1);
3398        let b = _mm_set1_epi16(-1);
3399        let r = _mm_min_epi16(a, b);
3400        assert_eq_m128i(r, b);
3401    }
3402
3403    #[simd_test(enable = "sse2")]
3404    unsafe fn test_mm_min_epu8() {
3405        let a = _mm_set1_epi8(1);
3406        let b = _mm_set1_epi8(!0);
3407        let r = _mm_min_epu8(a, b);
3408        assert_eq_m128i(r, a);
3409    }
3410
3411    #[simd_test(enable = "sse2")]
3412    unsafe fn test_mm_mulhi_epi16() {
3413        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3414        let r = _mm_mulhi_epi16(a, b);
3415        assert_eq_m128i(r, _mm_set1_epi16(-16));
3416    }
3417
3418    #[simd_test(enable = "sse2")]
3419    unsafe fn test_mm_mulhi_epu16() {
3420        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3421        let r = _mm_mulhi_epu16(a, b);
3422        assert_eq_m128i(r, _mm_set1_epi16(15));
3423    }
3424
3425    #[simd_test(enable = "sse2")]
3426    unsafe fn test_mm_mullo_epi16() {
3427        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3428        let r = _mm_mullo_epi16(a, b);
3429        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3430    }
3431
3432    #[simd_test(enable = "sse2")]
3433    unsafe fn test_mm_mul_epu32() {
3434        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3435        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3436        let r = _mm_mul_epu32(a, b);
3437        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3438        assert_eq_m128i(r, e);
3439    }
3440
3441    #[simd_test(enable = "sse2")]
3442    unsafe fn test_mm_sad_epu8() {
3443        #[rustfmt::skip]
3444        let a = _mm_setr_epi8(
3445            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3446            1, 2, 3, 4,
3447            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3448            1, 2, 3, 4,
3449        );
3450        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3451        let r = _mm_sad_epu8(a, b);
3452        let e = _mm_setr_epi64x(1020, 614);
3453        assert_eq_m128i(r, e);
3454    }
3455
3456    #[simd_test(enable = "sse2")]
3457    unsafe fn test_mm_sub_epi8() {
3458        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3459        let r = _mm_sub_epi8(a, b);
3460        assert_eq_m128i(r, _mm_set1_epi8(-1));
3461    }
3462
3463    #[simd_test(enable = "sse2")]
3464    unsafe fn test_mm_sub_epi16() {
3465        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3466        let r = _mm_sub_epi16(a, b);
3467        assert_eq_m128i(r, _mm_set1_epi16(-1));
3468    }
3469
3470    #[simd_test(enable = "sse2")]
3471    unsafe fn test_mm_sub_epi32() {
3472        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3473        let r = _mm_sub_epi32(a, b);
3474        assert_eq_m128i(r, _mm_set1_epi32(-1));
3475    }
3476
3477    #[simd_test(enable = "sse2")]
3478    unsafe fn test_mm_sub_epi64() {
3479        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3480        let r = _mm_sub_epi64(a, b);
3481        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3482    }
3483
3484    #[simd_test(enable = "sse2")]
3485    unsafe fn test_mm_subs_epi8() {
3486        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3487        let r = _mm_subs_epi8(a, b);
3488        assert_eq_m128i(r, _mm_set1_epi8(3));
3489    }
3490
3491    #[simd_test(enable = "sse2")]
3492    unsafe fn test_mm_subs_epi8_saturate_positive() {
3493        let a = _mm_set1_epi8(0x7F);
3494        let b = _mm_set1_epi8(-1);
3495        let r = _mm_subs_epi8(a, b);
3496        assert_eq_m128i(r, a);
3497    }
3498
3499    #[simd_test(enable = "sse2")]
3500    unsafe fn test_mm_subs_epi8_saturate_negative() {
3501        let a = _mm_set1_epi8(-0x80);
3502        let b = _mm_set1_epi8(1);
3503        let r = _mm_subs_epi8(a, b);
3504        assert_eq_m128i(r, a);
3505    }
3506
3507    #[simd_test(enable = "sse2")]
3508    unsafe fn test_mm_subs_epi16() {
3509        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3510        let r = _mm_subs_epi16(a, b);
3511        assert_eq_m128i(r, _mm_set1_epi16(3));
3512    }
3513
3514    #[simd_test(enable = "sse2")]
3515    unsafe fn test_mm_subs_epi16_saturate_positive() {
3516        let a = _mm_set1_epi16(0x7FFF);
3517        let b = _mm_set1_epi16(-1);
3518        let r = _mm_subs_epi16(a, b);
3519        assert_eq_m128i(r, a);
3520    }
3521
3522    #[simd_test(enable = "sse2")]
3523    unsafe fn test_mm_subs_epi16_saturate_negative() {
3524        let a = _mm_set1_epi16(-0x8000);
3525        let b = _mm_set1_epi16(1);
3526        let r = _mm_subs_epi16(a, b);
3527        assert_eq_m128i(r, a);
3528    }
3529
3530    #[simd_test(enable = "sse2")]
3531    unsafe fn test_mm_subs_epu8() {
3532        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3533        let r = _mm_subs_epu8(a, b);
3534        assert_eq_m128i(r, _mm_set1_epi8(3));
3535    }
3536
3537    #[simd_test(enable = "sse2")]
3538    unsafe fn test_mm_subs_epu8_saturate() {
3539        let a = _mm_set1_epi8(0);
3540        let b = _mm_set1_epi8(1);
3541        let r = _mm_subs_epu8(a, b);
3542        assert_eq_m128i(r, a);
3543    }
3544
3545    #[simd_test(enable = "sse2")]
3546    unsafe fn test_mm_subs_epu16() {
3547        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3548        let r = _mm_subs_epu16(a, b);
3549        assert_eq_m128i(r, _mm_set1_epi16(3));
3550    }
3551
3552    #[simd_test(enable = "sse2")]
3553    unsafe fn test_mm_subs_epu16_saturate() {
3554        let a = _mm_set1_epi16(0);
3555        let b = _mm_set1_epi16(1);
3556        let r = _mm_subs_epu16(a, b);
3557        assert_eq_m128i(r, a);
3558    }
3559
3560    #[simd_test(enable = "sse2")]
3561    unsafe fn test_mm_slli_si128() {
3562        #[rustfmt::skip]
3563        let a = _mm_setr_epi8(
3564            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3565        );
3566        let r = _mm_slli_si128::<1>(a);
3567        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3568        assert_eq_m128i(r, e);
3569
3570        #[rustfmt::skip]
3571        let a = _mm_setr_epi8(
3572            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3573        );
3574        let r = _mm_slli_si128::<15>(a);
3575        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3576        assert_eq_m128i(r, e);
3577
3578        #[rustfmt::skip]
3579        let a = _mm_setr_epi8(
3580            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3581        );
3582        let r = _mm_slli_si128::<16>(a);
3583        assert_eq_m128i(r, _mm_set1_epi8(0));
3584    }
3585
3586    #[simd_test(enable = "sse2")]
3587    unsafe fn test_mm_slli_epi16() {
3588        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3589        let r = _mm_slli_epi16::<4>(a);
3590        assert_eq_m128i(
3591            r,
3592            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3593        );
3594        let r = _mm_slli_epi16::<16>(a);
3595        assert_eq_m128i(r, _mm_set1_epi16(0));
3596    }
3597
3598    #[simd_test(enable = "sse2")]
3599    unsafe fn test_mm_sll_epi16() {
3600        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3601        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3602        assert_eq_m128i(
3603            r,
3604            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3605        );
3606        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3607        assert_eq_m128i(r, a);
3608        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3609        assert_eq_m128i(r, _mm_set1_epi16(0));
3610        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3611        assert_eq_m128i(r, _mm_set1_epi16(0));
3612    }
3613
3614    #[simd_test(enable = "sse2")]
3615    unsafe fn test_mm_slli_epi32() {
3616        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3617        let r = _mm_slli_epi32::<4>(a);
3618        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3619        let r = _mm_slli_epi32::<32>(a);
3620        assert_eq_m128i(r, _mm_set1_epi32(0));
3621    }
3622
3623    #[simd_test(enable = "sse2")]
3624    unsafe fn test_mm_sll_epi32() {
3625        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3626        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3627        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3628        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3629        assert_eq_m128i(r, a);
3630        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3631        assert_eq_m128i(r, _mm_set1_epi32(0));
3632        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3633        assert_eq_m128i(r, _mm_set1_epi32(0));
3634    }
3635
3636    #[simd_test(enable = "sse2")]
3637    unsafe fn test_mm_slli_epi64() {
3638        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3639        let r = _mm_slli_epi64::<4>(a);
3640        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3641        let r = _mm_slli_epi64::<64>(a);
3642        assert_eq_m128i(r, _mm_set1_epi64x(0));
3643    }
3644
3645    #[simd_test(enable = "sse2")]
3646    unsafe fn test_mm_sll_epi64() {
3647        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3648        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3649        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3650        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3651        assert_eq_m128i(r, a);
3652        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3653        assert_eq_m128i(r, _mm_set1_epi64x(0));
3654        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3655        assert_eq_m128i(r, _mm_set1_epi64x(0));
3656    }
3657
3658    #[simd_test(enable = "sse2")]
3659    unsafe fn test_mm_srai_epi16() {
3660        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3661        let r = _mm_srai_epi16::<4>(a);
3662        assert_eq_m128i(
3663            r,
3664            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3665        );
3666        let r = _mm_srai_epi16::<16>(a);
3667        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3668    }
3669
3670    #[simd_test(enable = "sse2")]
3671    unsafe fn test_mm_sra_epi16() {
3672        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3673        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3674        assert_eq_m128i(
3675            r,
3676            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3677        );
3678        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3679        assert_eq_m128i(r, a);
3680        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3681        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3682        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3683        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3684    }
3685
3686    #[simd_test(enable = "sse2")]
3687    unsafe fn test_mm_srai_epi32() {
3688        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3689        let r = _mm_srai_epi32::<4>(a);
3690        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3691        let r = _mm_srai_epi32::<32>(a);
3692        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3693    }
3694
3695    #[simd_test(enable = "sse2")]
3696    unsafe fn test_mm_sra_epi32() {
3697        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3698        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3699        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3700        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3701        assert_eq_m128i(r, a);
3702        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3703        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3704        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3705        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3706    }
3707
3708    #[simd_test(enable = "sse2")]
3709    unsafe fn test_mm_srli_si128() {
3710        #[rustfmt::skip]
3711        let a = _mm_setr_epi8(
3712            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3713        );
3714        let r = _mm_srli_si128::<1>(a);
3715        #[rustfmt::skip]
3716        let e = _mm_setr_epi8(
3717            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3718        );
3719        assert_eq_m128i(r, e);
3720
3721        #[rustfmt::skip]
3722        let a = _mm_setr_epi8(
3723            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3724        );
3725        let r = _mm_srli_si128::<15>(a);
3726        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3727        assert_eq_m128i(r, e);
3728
3729        #[rustfmt::skip]
3730        let a = _mm_setr_epi8(
3731            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3732        );
3733        let r = _mm_srli_si128::<16>(a);
3734        assert_eq_m128i(r, _mm_set1_epi8(0));
3735    }
3736
3737    #[simd_test(enable = "sse2")]
3738    unsafe fn test_mm_srli_epi16() {
3739        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3740        let r = _mm_srli_epi16::<4>(a);
3741        assert_eq_m128i(
3742            r,
3743            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3744        );
3745        let r = _mm_srli_epi16::<16>(a);
3746        assert_eq_m128i(r, _mm_set1_epi16(0));
3747    }
3748
3749    #[simd_test(enable = "sse2")]
3750    unsafe fn test_mm_srl_epi16() {
3751        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3752        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3753        assert_eq_m128i(
3754            r,
3755            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3756        );
3757        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3758        assert_eq_m128i(r, a);
3759        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3760        assert_eq_m128i(r, _mm_set1_epi16(0));
3761        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3762        assert_eq_m128i(r, _mm_set1_epi16(0));
3763    }
3764
3765    #[simd_test(enable = "sse2")]
3766    unsafe fn test_mm_srli_epi32() {
3767        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3768        let r = _mm_srli_epi32::<4>(a);
3769        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3770        let r = _mm_srli_epi32::<32>(a);
3771        assert_eq_m128i(r, _mm_set1_epi32(0));
3772    }
3773
3774    #[simd_test(enable = "sse2")]
3775    unsafe fn test_mm_srl_epi32() {
3776        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3777        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
3778        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3779        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
3780        assert_eq_m128i(r, a);
3781        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
3782        assert_eq_m128i(r, _mm_set1_epi32(0));
3783        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
3784        assert_eq_m128i(r, _mm_set1_epi32(0));
3785    }
3786
3787    #[simd_test(enable = "sse2")]
3788    unsafe fn test_mm_srli_epi64() {
3789        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3790        let r = _mm_srli_epi64::<4>(a);
3791        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3792        let r = _mm_srli_epi64::<64>(a);
3793        assert_eq_m128i(r, _mm_set1_epi64x(0));
3794    }
3795
3796    #[simd_test(enable = "sse2")]
3797    unsafe fn test_mm_srl_epi64() {
3798        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3799        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
3800        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3801        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
3802        assert_eq_m128i(r, a);
3803        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
3804        assert_eq_m128i(r, _mm_set1_epi64x(0));
3805        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
3806        assert_eq_m128i(r, _mm_set1_epi64x(0));
3807    }
3808
3809    #[simd_test(enable = "sse2")]
3810    unsafe fn test_mm_and_si128() {
3811        let a = _mm_set1_epi8(5);
3812        let b = _mm_set1_epi8(3);
3813        let r = _mm_and_si128(a, b);
3814        assert_eq_m128i(r, _mm_set1_epi8(1));
3815    }
3816
3817    #[simd_test(enable = "sse2")]
3818    unsafe fn test_mm_andnot_si128() {
3819        let a = _mm_set1_epi8(5);
3820        let b = _mm_set1_epi8(3);
3821        let r = _mm_andnot_si128(a, b);
3822        assert_eq_m128i(r, _mm_set1_epi8(2));
3823    }
3824
3825    #[simd_test(enable = "sse2")]
3826    unsafe fn test_mm_or_si128() {
3827        let a = _mm_set1_epi8(5);
3828        let b = _mm_set1_epi8(3);
3829        let r = _mm_or_si128(a, b);
3830        assert_eq_m128i(r, _mm_set1_epi8(7));
3831    }
3832
3833    #[simd_test(enable = "sse2")]
3834    unsafe fn test_mm_xor_si128() {
3835        let a = _mm_set1_epi8(5);
3836        let b = _mm_set1_epi8(3);
3837        let r = _mm_xor_si128(a, b);
3838        assert_eq_m128i(r, _mm_set1_epi8(6));
3839    }
3840
3841    #[simd_test(enable = "sse2")]
3842    unsafe fn test_mm_cmpeq_epi8() {
3843        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3844        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
3845        let r = _mm_cmpeq_epi8(a, b);
3846        #[rustfmt::skip]
3847        assert_eq_m128i(
3848            r,
3849            _mm_setr_epi8(
3850                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
3851            )
3852        );
3853    }
3854
3855    #[simd_test(enable = "sse2")]
3856    unsafe fn test_mm_cmpeq_epi16() {
3857        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3858        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
3859        let r = _mm_cmpeq_epi16(a, b);
3860        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
3861    }
3862
3863    #[simd_test(enable = "sse2")]
3864    unsafe fn test_mm_cmpeq_epi32() {
3865        let a = _mm_setr_epi32(0, 1, 2, 3);
3866        let b = _mm_setr_epi32(3, 2, 2, 0);
3867        let r = _mm_cmpeq_epi32(a, b);
3868        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
3869    }
3870
3871    #[simd_test(enable = "sse2")]
3872    unsafe fn test_mm_cmpgt_epi8() {
3873        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3874        let b = _mm_set1_epi8(0);
3875        let r = _mm_cmpgt_epi8(a, b);
3876        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3877        assert_eq_m128i(r, e);
3878    }
3879
3880    #[simd_test(enable = "sse2")]
3881    unsafe fn test_mm_cmpgt_epi16() {
3882        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3883        let b = _mm_set1_epi16(0);
3884        let r = _mm_cmpgt_epi16(a, b);
3885        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3886        assert_eq_m128i(r, e);
3887    }
3888
3889    #[simd_test(enable = "sse2")]
3890    unsafe fn test_mm_cmpgt_epi32() {
3891        let a = _mm_set_epi32(5, 0, 0, 0);
3892        let b = _mm_set1_epi32(0);
3893        let r = _mm_cmpgt_epi32(a, b);
3894        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3895    }
3896
3897    #[simd_test(enable = "sse2")]
3898    unsafe fn test_mm_cmplt_epi8() {
3899        let a = _mm_set1_epi8(0);
3900        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3901        let r = _mm_cmplt_epi8(a, b);
3902        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3903        assert_eq_m128i(r, e);
3904    }
3905
3906    #[simd_test(enable = "sse2")]
3907    unsafe fn test_mm_cmplt_epi16() {
3908        let a = _mm_set1_epi16(0);
3909        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3910        let r = _mm_cmplt_epi16(a, b);
3911        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3912        assert_eq_m128i(r, e);
3913    }
3914
3915    #[simd_test(enable = "sse2")]
3916    unsafe fn test_mm_cmplt_epi32() {
3917        let a = _mm_set1_epi32(0);
3918        let b = _mm_set_epi32(5, 0, 0, 0);
3919        let r = _mm_cmplt_epi32(a, b);
3920        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3921    }
3922
3923    #[simd_test(enable = "sse2")]
3924    unsafe fn test_mm_cvtepi32_pd() {
3925        let a = _mm_set_epi32(35, 25, 15, 5);
3926        let r = _mm_cvtepi32_pd(a);
3927        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
3928    }
3929
3930    #[simd_test(enable = "sse2")]
3931    unsafe fn test_mm_cvtsi32_sd() {
3932        let a = _mm_set1_pd(3.5);
3933        let r = _mm_cvtsi32_sd(a, 5);
3934        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
3935    }
3936
3937    #[simd_test(enable = "sse2")]
3938    unsafe fn test_mm_cvtepi32_ps() {
3939        let a = _mm_setr_epi32(1, 2, 3, 4);
3940        let r = _mm_cvtepi32_ps(a);
3941        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3942    }
3943
3944    #[simd_test(enable = "sse2")]
3945    unsafe fn test_mm_cvtps_epi32() {
3946        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3947        let r = _mm_cvtps_epi32(a);
3948        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
3949    }
3950
3951    #[simd_test(enable = "sse2")]
3952    unsafe fn test_mm_cvtsi32_si128() {
3953        let r = _mm_cvtsi32_si128(5);
3954        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
3955    }
3956
3957    #[simd_test(enable = "sse2")]
3958    unsafe fn test_mm_cvtsi128_si32() {
3959        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
3960        assert_eq!(r, 5);
3961    }
3962
3963    #[simd_test(enable = "sse2")]
3964    unsafe fn test_mm_set_epi64x() {
3965        let r = _mm_set_epi64x(0, 1);
3966        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
3967    }
3968
3969    #[simd_test(enable = "sse2")]
3970    unsafe fn test_mm_set_epi32() {
3971        let r = _mm_set_epi32(0, 1, 2, 3);
3972        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
3973    }
3974
3975    #[simd_test(enable = "sse2")]
3976    unsafe fn test_mm_set_epi16() {
3977        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3978        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
3979    }
3980
3981    #[simd_test(enable = "sse2")]
3982    unsafe fn test_mm_set_epi8() {
3983        #[rustfmt::skip]
3984        let r = _mm_set_epi8(
3985            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3986        );
3987        #[rustfmt::skip]
3988        let e = _mm_setr_epi8(
3989            15, 14, 13, 12, 11, 10, 9, 8,
3990            7, 6, 5, 4, 3, 2, 1, 0,
3991        );
3992        assert_eq_m128i(r, e);
3993    }
3994
3995    #[simd_test(enable = "sse2")]
3996    unsafe fn test_mm_set1_epi64x() {
3997        let r = _mm_set1_epi64x(1);
3998        assert_eq_m128i(r, _mm_set1_epi64x(1));
3999    }
4000
4001    #[simd_test(enable = "sse2")]
4002    unsafe fn test_mm_set1_epi32() {
4003        let r = _mm_set1_epi32(1);
4004        assert_eq_m128i(r, _mm_set1_epi32(1));
4005    }
4006
4007    #[simd_test(enable = "sse2")]
4008    unsafe fn test_mm_set1_epi16() {
4009        let r = _mm_set1_epi16(1);
4010        assert_eq_m128i(r, _mm_set1_epi16(1));
4011    }
4012
4013    #[simd_test(enable = "sse2")]
4014    unsafe fn test_mm_set1_epi8() {
4015        let r = _mm_set1_epi8(1);
4016        assert_eq_m128i(r, _mm_set1_epi8(1));
4017    }
4018
4019    #[simd_test(enable = "sse2")]
4020    unsafe fn test_mm_setr_epi32() {
4021        let r = _mm_setr_epi32(0, 1, 2, 3);
4022        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
4023    }
4024
4025    #[simd_test(enable = "sse2")]
4026    unsafe fn test_mm_setr_epi16() {
4027        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4028        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
4029    }
4030
4031    #[simd_test(enable = "sse2")]
4032    unsafe fn test_mm_setr_epi8() {
4033        #[rustfmt::skip]
4034        let r = _mm_setr_epi8(
4035            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4036        );
4037        #[rustfmt::skip]
4038        let e = _mm_setr_epi8(
4039            0, 1, 2, 3, 4, 5, 6, 7,
4040            8, 9, 10, 11, 12, 13, 14, 15,
4041        );
4042        assert_eq_m128i(r, e);
4043    }
4044
4045    #[simd_test(enable = "sse2")]
4046    unsafe fn test_mm_setzero_si128() {
4047        let r = _mm_setzero_si128();
4048        assert_eq_m128i(r, _mm_set1_epi64x(0));
4049    }
4050
4051    #[simd_test(enable = "sse2")]
4052    unsafe fn test_mm_loadl_epi64() {
4053        let a = _mm_setr_epi64x(6, 5);
4054        let r = _mm_loadl_epi64(ptr::addr_of!(a));
4055        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
4056    }
4057
4058    #[simd_test(enable = "sse2")]
4059    unsafe fn test_mm_load_si128() {
4060        let a = _mm_set_epi64x(5, 6);
4061        let r = _mm_load_si128(ptr::addr_of!(a) as *const _);
4062        assert_eq_m128i(a, r);
4063    }
4064
4065    #[simd_test(enable = "sse2")]
4066    unsafe fn test_mm_loadu_si128() {
4067        let a = _mm_set_epi64x(5, 6);
4068        let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _);
4069        assert_eq_m128i(a, r);
4070    }
4071
4072    #[simd_test(enable = "sse2")]
4073    // Miri cannot support this until it is clear how it fits in the Rust memory model
4074    // (non-temporal store)
4075    #[cfg_attr(miri, ignore)]
4076    unsafe fn test_mm_maskmoveu_si128() {
4077        let a = _mm_set1_epi8(9);
4078        #[rustfmt::skip]
4079        let mask = _mm_set_epi8(
4080            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
4081            0, 0, 0, 0, 0, 0, 0, 0,
4082        );
4083        let mut r = _mm_set1_epi8(0);
4084        _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
4085        _mm_sfence();
4086        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4087        assert_eq_m128i(r, e);
4088    }
4089
4090    #[simd_test(enable = "sse2")]
4091    unsafe fn test_mm_store_si128() {
4092        let a = _mm_set1_epi8(9);
4093        let mut r = _mm_set1_epi8(0);
4094        _mm_store_si128(&mut r, a);
4095        assert_eq_m128i(r, a);
4096    }
4097
4098    #[simd_test(enable = "sse2")]
4099    unsafe fn test_mm_storeu_si128() {
4100        let a = _mm_set1_epi8(9);
4101        let mut r = _mm_set1_epi8(0);
4102        _mm_storeu_si128(&mut r, a);
4103        assert_eq_m128i(r, a);
4104    }
4105
4106    #[simd_test(enable = "sse2")]
4107    unsafe fn test_mm_storel_epi64() {
4108        let a = _mm_setr_epi64x(2, 9);
4109        let mut r = _mm_set1_epi8(0);
4110        _mm_storel_epi64(&mut r, a);
4111        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
4112    }
4113
4114    #[simd_test(enable = "sse2")]
4115    // Miri cannot support this until it is clear how it fits in the Rust memory model
4116    // (non-temporal store)
4117    #[cfg_attr(miri, ignore)]
4118    unsafe fn test_mm_stream_si128() {
4119        let a = _mm_setr_epi32(1, 2, 3, 4);
4120        let mut r = _mm_undefined_si128();
4121        _mm_stream_si128(ptr::addr_of_mut!(r), a);
4122        _mm_sfence();
4123        assert_eq_m128i(r, a);
4124    }
4125
4126    #[simd_test(enable = "sse2")]
4127    // Miri cannot support this until it is clear how it fits in the Rust memory model
4128    // (non-temporal store)
4129    #[cfg_attr(miri, ignore)]
4130    unsafe fn test_mm_stream_si32() {
4131        let a: i32 = 7;
4132        let mut mem = boxed::Box::<i32>::new(-1);
4133        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
4134        _mm_sfence();
4135        assert_eq!(a, *mem);
4136    }
4137
4138    #[simd_test(enable = "sse2")]
4139    unsafe fn test_mm_move_epi64() {
4140        let a = _mm_setr_epi64x(5, 6);
4141        let r = _mm_move_epi64(a);
4142        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4143    }
4144
4145    #[simd_test(enable = "sse2")]
4146    unsafe fn test_mm_packs_epi16() {
4147        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
4148        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
4149        let r = _mm_packs_epi16(a, b);
4150        #[rustfmt::skip]
4151        assert_eq_m128i(
4152            r,
4153            _mm_setr_epi8(
4154                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
4155            )
4156        );
4157    }
4158
4159    #[simd_test(enable = "sse2")]
4160    unsafe fn test_mm_packs_epi32() {
4161        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
4162        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
4163        let r = _mm_packs_epi32(a, b);
4164        assert_eq_m128i(
4165            r,
4166            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
4167        );
4168    }
4169
4170    #[simd_test(enable = "sse2")]
4171    unsafe fn test_mm_packus_epi16() {
4172        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
4173        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
4174        let r = _mm_packus_epi16(a, b);
4175        assert_eq_m128i(
4176            r,
4177            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
4178        );
4179    }
4180
4181    #[simd_test(enable = "sse2")]
4182    unsafe fn test_mm_extract_epi16() {
4183        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
4184        let r1 = _mm_extract_epi16::<0>(a);
4185        let r2 = _mm_extract_epi16::<3>(a);
4186        assert_eq!(r1, 0xFFFF);
4187        assert_eq!(r2, 3);
4188    }
4189
4190    #[simd_test(enable = "sse2")]
4191    unsafe fn test_mm_insert_epi16() {
4192        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4193        let r = _mm_insert_epi16::<0>(a, 9);
4194        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
4195        assert_eq_m128i(r, e);
4196    }
4197
4198    #[simd_test(enable = "sse2")]
4199    unsafe fn test_mm_movemask_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
            0b0101, 0b1111_0000u8 as i8, 0, 0,
            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
        );
        let r = _mm_movemask_epi8(a);
        assert_eq!(r, 0b10100110_00100101);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shuffle_epi32() {
        let a = _mm_setr_epi32(5, 10, 15, 20);
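        // Each 2-bit field of the immediate, read from the low bits upwards,
        // selects the source lane for one destination lane:
        // 0b00_01_01_11 yields (a[3], a[1], a[1], a[0]).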
        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
        let e = _mm_setr_epi32(20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflehi_epi16() {
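        // Only the upper four lanes are permuted (using the same immediate
        // encoding as _mm_shuffle_epi32); the lower half passes through
        // unchanged. _mm_shufflelo_epi16 below is the mirror image.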
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflelo_epi16() {
        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpackhi_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpackhi_epi16(a, b);
        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpackhi_epi32(a, b);
        let e = _mm_setr_epi32(2, 6, 3, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpackhi_epi64(a, b);
        let e = _mm_setr_epi64x(1, 3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpacklo_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 16, 1, 17, 2, 18, 3, 19,
            4, 20, 5, 21, 6, 22, 7, 23,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpacklo_epi16(a, b);
        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpacklo_epi32(a, b);
        let e = _mm_setr_epi32(0, 4, 1, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpacklo_epi64(a, b);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_sd() {
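        // The *_sd operations act on the low lane only and copy the upper
        // lane of `a` straight through to the result.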
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_div_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_div_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
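        // `maxpd` returns its second operand when the operands compare equal,
        // and -0.0 == 0.0, so the result is bitwise identical to that operand.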
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
        let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
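        // Like `maxpd`, `minpd` returns its second operand when the operands
        // compare equal, so min(-0.0, 0.0) is bitwise 0.0, not -0.0.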
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
        let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sqrt_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sqrt_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sqrt_pd() {
        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_and_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_and_pd(a, b);
        let e = transmute(u64x2::splat(1));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_andnot_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_andnot_pd(a, b);
        let e = transmute(u64x2::splat(2));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_or_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_or_pd(a, b);
        let e = transmute(u64x2::splat(7));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_xor_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_xor_pd(a, b);
        let e = transmute(u64x2::splat(6));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_sd() {
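        // Scalar compares write an all-ones (true) or all-zeros (false) mask
        // into the low lane and copy the upper lane of `a` through unchanged,
        // hence the `2.0f64.to_bits()` in the expected value.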
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmple_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpunord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpneq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnlt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnle_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpngt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmple_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpunord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpneq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnlt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnle_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpngt_pd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comieq_sd() {
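        // The comi* intrinsics compare the low lanes and return 0 or 1 (from
        // EFLAGS) instead of a lane mask; a NaN operand makes the ordered
        // predicates return 0.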
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomieq_sd() {
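        // ucomi* returns the same values as comi* for every input; the
        // difference is that it raises the invalid-operation exception only
        // for signaling NaNs, while comi* raises it for quiet NaNs as well.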
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
        assert!(_mm_ucomieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_movemask_pd() {
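        // Collects the sign bit of each f64 lane into bits 0 and 1 of the
        // result.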
        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
        assert_eq!(r, 0b01);

        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
        assert_eq!(r, 0b11);
    }

    #[repr(align(16))]
    struct Memory {
        data: [f64; 4],
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_pd() {
        let mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mem.data;
        let d = vals.as_ptr();

        let r = _mm_load_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_sd() {
        let a = 1.;
        let expected = _mm_setr_pd(a, 0.);
        let r = _mm_load_sd(&a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadh_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
        let r = _mm_loadh_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadl_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(3., get_m128d(a, 1));
        let r = _mm_loadl_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_pd() {
        #[repr(align(128))]
        struct Memory {
            pub data: [f64; 2],
        }
        let a = _mm_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 2] };

        _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..2 {
            assert_eq!(mem.data[i], get_m128d(a, i));
        }
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_sd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_store_sd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 2.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to a 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.add(1);
        }

        _mm_storeu_pd(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_setr_epi32(5, 6, 7, 8);
        _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi32(1, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si64() {
        let a = _mm_setr_epi64x(1, 2);
        let mut r = _mm_setr_epi64x(3, 4);
        _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi64x(1, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store1_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store1_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd1() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd1(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storer_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_storer_pd(d, *black_box(&a));
        assert_eq!(vals[0], 2.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeh_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storeh_pd(&mut dest, a);
        assert_eq!(dest, get_m128d(a, 1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storel_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storel_pd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadr_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let d = vals.as_ptr();

        let r = _mm_loadr_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let mut d = vals.as_ptr();

        // make sure d is not aligned to a 16-byte boundary
        let mut offset = 0;
        if (d as usize) & 0xf == 0 {
            offset = 1;
            d = d.add(offset);
        }

        let r = _mm_loadu_pd(d);
        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si64() {
        let a = _mm_setr_epi64x(5, 6);
        let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtpd_ps() {
        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtps_pd() {
        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));

        let r = _mm_cvtps_pd(_mm_setr_ps(
            f32::MAX,
            f32::INFINITY,
            f32::NEG_INFINITY,
            f32::MIN,
        ));
        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtpd_epi32() {
        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));

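        // Out-of-range, infinite, and NaN inputs all convert to the "integer
        // indefinite" value, i32::MIN (0x8000_0000).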
        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_si32() {
        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
        assert_eq!(r, -2);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq!(r, i32::MIN);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_ss() {
        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
        let b = _mm_setr_pd(2.0, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));

        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
        let b = _mm_setr_pd(f64::INFINITY, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(
            r,
            _mm_setr_ps(
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::MAX,
                f32::NEG_INFINITY,
            ),
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_f64() {
        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
        assert_eq!(r, -1.1);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtss_sd() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));

        let a = _mm_setr_pd(-1.1, f64::INFINITY);
        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttpd_epi32() {
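        // The cvtt* variants truncate toward zero instead of using the
        // current rounding mode, so -1.1 becomes -1.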
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttsd_si32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, -1);

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttps_epi32() {
        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));

        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_sd() {
        let r = _mm_set_sd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_pd() {
        let r = _mm_set1_pd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_pd1() {
        let r = _mm_set_pd1(-2.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_pd() {
        let r = _mm_set_pd(1.0_f64, 5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_pd() {
        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setzero_pd() {
        let r = _mm_setzero_pd();
        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load1_pd() {
        let d = -5.0;
        let r = _mm_load1_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_pd1() {
        let d = -5.0;
        let r = _mm_load_pd1(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpackhi_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpacklo_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shuffle_pd() {
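        // Bit 0 of the immediate picks the lane of `a` for the low result
        // lane and bit 1 picks the lane of `b` for the high one, so an
        // immediate of 0 yields (a[0], b[0]).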
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(1., 3.);
        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_move_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(3., 2.);
        let r = _mm_move_sd(a, b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castpd_ps() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castpd_ps(a);
        assert_eq_m128(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castpd_si128() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_epi64x(0);
        let r = _mm_castpd_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castps_pd() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castps_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castps_si128() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_epi32(0);
        let r = _mm_castps_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castsi128_pd() {
        let a = _mm_set1_epi64x(0);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castsi128_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castsi128_ps() {
        let a = _mm_set1_epi32(0);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castsi128_ps(a);
        assert_eq_m128(r, expected);
    }
}