core/stdarch/crates/core_arch/src/x86/
sse2.rs

1//! Streaming SIMD Extensions 2 (SSE2)
2
3#[cfg(test)]
4use stdarch_test::assert_instr;
5
6use crate::{
7    core_arch::{simd::*, x86::*},
8    intrinsics::simd::*,
9    intrinsics::sqrtf64,
10    mem, ptr,
11};
12
13/// Provides a hint to the processor that the code sequence is a spin-wait loop.
14///
15/// This can help improve the performance and power consumption of spin-wait
16/// loops.
17///
18/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
19#[inline]
20#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
21#[stable(feature = "simd_x86", since = "1.27.0")]
22pub unsafe fn _mm_pause() {
23    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
24    // the SSE2 target-feature - therefore it does not require any target features
25    pause()
26}
27
28/// Invalidates and flushes the cache line that contains `p` from all levels of
29/// the cache hierarchy.
30///
31/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
32#[inline]
33#[target_feature(enable = "sse2")]
34#[cfg_attr(test, assert_instr(clflush))]
35#[stable(feature = "simd_x86", since = "1.27.0")]
36pub unsafe fn _mm_clflush(p: *const u8) {
37    clflush(p)
38}
39
40/// Performs a serializing operation on all load-from-memory instructions
41/// that were issued prior to this instruction.
42///
43/// Guarantees that every load instruction that precedes, in program order, is
44/// globally visible before any load instruction which follows the fence in
45/// program order.
46///
47/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
48#[inline]
49#[target_feature(enable = "sse2")]
50#[cfg_attr(test, assert_instr(lfence))]
51#[stable(feature = "simd_x86", since = "1.27.0")]
52pub unsafe fn _mm_lfence() {
53    lfence()
54}
55
56/// Performs a serializing operation on all load-from-memory and store-to-memory
57/// instructions that were issued prior to this instruction.
58///
59/// Guarantees that every memory access that precedes, in program order, the
60/// memory fence instruction is globally visible before any memory instruction
61/// which follows the fence in program order.
62///
63/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
64#[inline]
65#[target_feature(enable = "sse2")]
66#[cfg_attr(test, assert_instr(mfence))]
67#[stable(feature = "simd_x86", since = "1.27.0")]
68pub unsafe fn _mm_mfence() {
69    mfence()
70}
71
72/// Adds packed 8-bit integers in `a` and `b`.
73///
74/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
75#[inline]
76#[target_feature(enable = "sse2")]
77#[cfg_attr(test, assert_instr(paddb))]
78#[stable(feature = "simd_x86", since = "1.27.0")]
79pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
80    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
81}
82
83/// Adds packed 16-bit integers in `a` and `b`.
84///
85/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
86#[inline]
87#[target_feature(enable = "sse2")]
88#[cfg_attr(test, assert_instr(paddw))]
89#[stable(feature = "simd_x86", since = "1.27.0")]
90pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
91    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
92}
93
94/// Adds packed 32-bit integers in `a` and `b`.
95///
96/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
97#[inline]
98#[target_feature(enable = "sse2")]
99#[cfg_attr(test, assert_instr(paddd))]
100#[stable(feature = "simd_x86", since = "1.27.0")]
101pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
102    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
103}
104
105/// Adds packed 64-bit integers in `a` and `b`.
106///
107/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
108#[inline]
109#[target_feature(enable = "sse2")]
110#[cfg_attr(test, assert_instr(paddq))]
111#[stable(feature = "simd_x86", since = "1.27.0")]
112pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
113    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
114}
115
116/// Adds packed 8-bit integers in `a` and `b` using saturation.
117///
118/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
119#[inline]
120#[target_feature(enable = "sse2")]
121#[cfg_attr(test, assert_instr(paddsb))]
122#[stable(feature = "simd_x86", since = "1.27.0")]
123pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
124    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
125}
126
127/// Adds packed 16-bit integers in `a` and `b` using saturation.
128///
129/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
130#[inline]
131#[target_feature(enable = "sse2")]
132#[cfg_attr(test, assert_instr(paddsw))]
133#[stable(feature = "simd_x86", since = "1.27.0")]
134pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
135    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
136}
137
138/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
139///
140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
141#[inline]
142#[target_feature(enable = "sse2")]
143#[cfg_attr(test, assert_instr(paddusb))]
144#[stable(feature = "simd_x86", since = "1.27.0")]
145pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
146    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
147}
148
149/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
150///
151/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
152#[inline]
153#[target_feature(enable = "sse2")]
154#[cfg_attr(test, assert_instr(paddusw))]
155#[stable(feature = "simd_x86", since = "1.27.0")]
156pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
157    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
158}
159
160/// Averages packed unsigned 8-bit integers in `a` and `b`.
161///
162/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
163#[inline]
164#[target_feature(enable = "sse2")]
165#[cfg_attr(test, assert_instr(pavgb))]
166#[stable(feature = "simd_x86", since = "1.27.0")]
167pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
168    unsafe {
169        let a = simd_cast::<_, u16x16>(a.as_u8x16());
170        let b = simd_cast::<_, u16x16>(b.as_u8x16());
171        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
172        transmute(simd_cast::<_, u8x16>(r))
173    }
174}
175
176/// Averages packed unsigned 16-bit integers in `a` and `b`.
177///
178/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
179#[inline]
180#[target_feature(enable = "sse2")]
181#[cfg_attr(test, assert_instr(pavgw))]
182#[stable(feature = "simd_x86", since = "1.27.0")]
183pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
184    unsafe {
185        let a = simd_cast::<_, u32x8>(a.as_u16x8());
186        let b = simd_cast::<_, u32x8>(b.as_u16x8());
187        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
188        transmute(simd_cast::<_, u16x8>(r))
189    }
190}
191
192/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`.
193///
194/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
195/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of
196/// intermediate 32-bit integers.
197///
198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
199#[inline]
200#[target_feature(enable = "sse2")]
201#[cfg_attr(test, assert_instr(pmaddwd))]
202#[stable(feature = "simd_x86", since = "1.27.0")]
203pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
204    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
205}
206
207/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
208/// maximum values.
209///
210/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
211#[inline]
212#[target_feature(enable = "sse2")]
213#[cfg_attr(test, assert_instr(pmaxsw))]
214#[stable(feature = "simd_x86", since = "1.27.0")]
215pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
216    unsafe {
217        let a = a.as_i16x8();
218        let b = b.as_i16x8();
219        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
220    }
221}
222
223/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
224/// packed maximum values.
225///
226/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
227#[inline]
228#[target_feature(enable = "sse2")]
229#[cfg_attr(test, assert_instr(pmaxub))]
230#[stable(feature = "simd_x86", since = "1.27.0")]
231pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
232    unsafe {
233        let a = a.as_u8x16();
234        let b = b.as_u8x16();
235        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
236    }
237}
238
239/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
240/// minimum values.
241///
242/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
243#[inline]
244#[target_feature(enable = "sse2")]
245#[cfg_attr(test, assert_instr(pminsw))]
246#[stable(feature = "simd_x86", since = "1.27.0")]
247pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
248    unsafe {
249        let a = a.as_i16x8();
250        let b = b.as_i16x8();
251        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
252    }
253}
254
255/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
256/// packed minimum values.
257///
258/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
259#[inline]
260#[target_feature(enable = "sse2")]
261#[cfg_attr(test, assert_instr(pminub))]
262#[stable(feature = "simd_x86", since = "1.27.0")]
263pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
264    unsafe {
265        let a = a.as_u8x16();
266        let b = b.as_u8x16();
267        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
268    }
269}
270
271/// Multiplies the packed 16-bit integers in `a` and `b`.
272///
273/// The multiplication produces intermediate 32-bit integers, and returns the
274/// high 16 bits of the intermediate integers.
275///
276/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
277#[inline]
278#[target_feature(enable = "sse2")]
279#[cfg_attr(test, assert_instr(pmulhw))]
280#[stable(feature = "simd_x86", since = "1.27.0")]
281pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
282    unsafe {
283        let a = simd_cast::<_, i32x8>(a.as_i16x8());
284        let b = simd_cast::<_, i32x8>(b.as_i16x8());
285        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
286        transmute(simd_cast::<i32x8, i16x8>(r))
287    }
288}
289
290/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
291///
292/// The multiplication produces intermediate 32-bit integers, and returns the
293/// high 16 bits of the intermediate integers.
294///
295/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
296#[inline]
297#[target_feature(enable = "sse2")]
298#[cfg_attr(test, assert_instr(pmulhuw))]
299#[stable(feature = "simd_x86", since = "1.27.0")]
300pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
301    unsafe {
302        let a = simd_cast::<_, u32x8>(a.as_u16x8());
303        let b = simd_cast::<_, u32x8>(b.as_u16x8());
304        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
305        transmute(simd_cast::<u32x8, u16x8>(r))
306    }
307}
308
309/// Multiplies the packed 16-bit integers in `a` and `b`.
310///
311/// The multiplication produces intermediate 32-bit integers, and returns the
312/// low 16 bits of the intermediate integers.
313///
314/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
315#[inline]
316#[target_feature(enable = "sse2")]
317#[cfg_attr(test, assert_instr(pmullw))]
318#[stable(feature = "simd_x86", since = "1.27.0")]
319pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
320    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
321}
322
323/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
324/// in `a` and `b`.
325///
326/// Returns the unsigned 64-bit results.
327///
328/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
329#[inline]
330#[target_feature(enable = "sse2")]
331#[cfg_attr(test, assert_instr(pmuludq))]
332#[stable(feature = "simd_x86", since = "1.27.0")]
333pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
334    unsafe {
335        let a = a.as_u64x2();
336        let b = b.as_u64x2();
337        let mask = u64x2::splat(u32::MAX.into());
338        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
339    }
340}
341
342/// Sum the absolute differences of packed unsigned 8-bit integers.
343///
344/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
345/// and `b`, then horizontally sum each consecutive 8 differences to produce
346/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in
347/// the low 16 bits of 64-bit elements returned.
348///
349/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
350#[inline]
351#[target_feature(enable = "sse2")]
352#[cfg_attr(test, assert_instr(psadbw))]
353#[stable(feature = "simd_x86", since = "1.27.0")]
354pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
355    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
356}
357
358/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
359///
360/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
361#[inline]
362#[target_feature(enable = "sse2")]
363#[cfg_attr(test, assert_instr(psubb))]
364#[stable(feature = "simd_x86", since = "1.27.0")]
365pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
366    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
367}
368
369/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
370///
371/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
372#[inline]
373#[target_feature(enable = "sse2")]
374#[cfg_attr(test, assert_instr(psubw))]
375#[stable(feature = "simd_x86", since = "1.27.0")]
376pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
377    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
378}
379
380/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
381///
382/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
383#[inline]
384#[target_feature(enable = "sse2")]
385#[cfg_attr(test, assert_instr(psubd))]
386#[stable(feature = "simd_x86", since = "1.27.0")]
387pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
388    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
389}
390
391/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
392///
393/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
394#[inline]
395#[target_feature(enable = "sse2")]
396#[cfg_attr(test, assert_instr(psubq))]
397#[stable(feature = "simd_x86", since = "1.27.0")]
398pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
399    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
400}
401
402/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
403/// using saturation.
404///
405/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
406#[inline]
407#[target_feature(enable = "sse2")]
408#[cfg_attr(test, assert_instr(psubsb))]
409#[stable(feature = "simd_x86", since = "1.27.0")]
410pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
411    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
412}
413
414/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
415/// using saturation.
416///
417/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
418#[inline]
419#[target_feature(enable = "sse2")]
420#[cfg_attr(test, assert_instr(psubsw))]
421#[stable(feature = "simd_x86", since = "1.27.0")]
422pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
423    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
424}
425
426/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
427/// integers in `a` using saturation.
428///
429/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
430#[inline]
431#[target_feature(enable = "sse2")]
432#[cfg_attr(test, assert_instr(psubusb))]
433#[stable(feature = "simd_x86", since = "1.27.0")]
434pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
435    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
436}
437
438/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
439/// integers in `a` using saturation.
440///
441/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
442#[inline]
443#[target_feature(enable = "sse2")]
444#[cfg_attr(test, assert_instr(psubusw))]
445#[stable(feature = "simd_x86", since = "1.27.0")]
446pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
447    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
448}
449
450/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
451///
452/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
453#[inline]
454#[target_feature(enable = "sse2")]
455#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
456#[rustc_legacy_const_generics(1)]
457#[stable(feature = "simd_x86", since = "1.27.0")]
458pub fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
459    static_assert_uimm_bits!(IMM8, 8);
460    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
461}
462
463/// Implementation detail: converts the immediate argument of the
464/// `_mm_slli_si128` intrinsic into a compile-time constant.
465#[inline]
466#[target_feature(enable = "sse2")]
467unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
468    const fn mask(shift: i32, i: u32) -> u32 {
469        let shift = shift as u32 & 0xff;
470        if shift > 15 { i } else { 16 - shift + i }
471    }
472    transmute::<i8x16, _>(simd_shuffle!(
473        i8x16::ZERO,
474        a.as_i8x16(),
475        [
476            mask(IMM8, 0),
477            mask(IMM8, 1),
478            mask(IMM8, 2),
479            mask(IMM8, 3),
480            mask(IMM8, 4),
481            mask(IMM8, 5),
482            mask(IMM8, 6),
483            mask(IMM8, 7),
484            mask(IMM8, 8),
485            mask(IMM8, 9),
486            mask(IMM8, 10),
487            mask(IMM8, 11),
488            mask(IMM8, 12),
489            mask(IMM8, 13),
490            mask(IMM8, 14),
491            mask(IMM8, 15),
492        ],
493    ))
494}
495
496/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
497///
498/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
499#[inline]
500#[target_feature(enable = "sse2")]
501#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
502#[rustc_legacy_const_generics(1)]
503#[stable(feature = "simd_x86", since = "1.27.0")]
504pub fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
505    unsafe {
506        static_assert_uimm_bits!(IMM8, 8);
507        _mm_slli_si128_impl::<IMM8>(a)
508    }
509}
510
511/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
512///
513/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
514#[inline]
515#[target_feature(enable = "sse2")]
516#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
517#[rustc_legacy_const_generics(1)]
518#[stable(feature = "simd_x86", since = "1.27.0")]
519pub fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
520    unsafe {
521        static_assert_uimm_bits!(IMM8, 8);
522        _mm_srli_si128_impl::<IMM8>(a)
523    }
524}
525
526/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
527///
528/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
529#[inline]
530#[target_feature(enable = "sse2")]
531#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
532#[rustc_legacy_const_generics(1)]
533#[stable(feature = "simd_x86", since = "1.27.0")]
534pub fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
535    static_assert_uimm_bits!(IMM8, 8);
536    unsafe {
537        if IMM8 >= 16 {
538            _mm_setzero_si128()
539        } else {
540            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
541        }
542    }
543}
544
545/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
546/// zeros.
547///
548/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
549#[inline]
550#[target_feature(enable = "sse2")]
551#[cfg_attr(test, assert_instr(psllw))]
552#[stable(feature = "simd_x86", since = "1.27.0")]
553pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
554    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
555}
556
557/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
558///
559/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
560#[inline]
561#[target_feature(enable = "sse2")]
562#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
563#[rustc_legacy_const_generics(1)]
564#[stable(feature = "simd_x86", since = "1.27.0")]
565pub fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
566    static_assert_uimm_bits!(IMM8, 8);
567    unsafe {
568        if IMM8 >= 32 {
569            _mm_setzero_si128()
570        } else {
571            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
572        }
573    }
574}
575
576/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
577/// zeros.
578///
579/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
580#[inline]
581#[target_feature(enable = "sse2")]
582#[cfg_attr(test, assert_instr(pslld))]
583#[stable(feature = "simd_x86", since = "1.27.0")]
584pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
585    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
586}
587
588/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
589///
590/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
591#[inline]
592#[target_feature(enable = "sse2")]
593#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
594#[rustc_legacy_const_generics(1)]
595#[stable(feature = "simd_x86", since = "1.27.0")]
596pub fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
597    static_assert_uimm_bits!(IMM8, 8);
598    unsafe {
599        if IMM8 >= 64 {
600            _mm_setzero_si128()
601        } else {
602            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
603        }
604    }
605}
606
607/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
608/// zeros.
609///
610/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
611#[inline]
612#[target_feature(enable = "sse2")]
613#[cfg_attr(test, assert_instr(psllq))]
614#[stable(feature = "simd_x86", since = "1.27.0")]
615pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
616    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
617}
618
619/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
620/// bits.
621///
622/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
623#[inline]
624#[target_feature(enable = "sse2")]
625#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
626#[rustc_legacy_const_generics(1)]
627#[stable(feature = "simd_x86", since = "1.27.0")]
628pub fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
629    static_assert_uimm_bits!(IMM8, 8);
630    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
631}
632
633/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
634/// bits.
635///
636/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
637#[inline]
638#[target_feature(enable = "sse2")]
639#[cfg_attr(test, assert_instr(psraw))]
640#[stable(feature = "simd_x86", since = "1.27.0")]
641pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
642    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
643}
644
645/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
646/// bits.
647///
648/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
649#[inline]
650#[target_feature(enable = "sse2")]
651#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
652#[rustc_legacy_const_generics(1)]
653#[stable(feature = "simd_x86", since = "1.27.0")]
654pub fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
655    static_assert_uimm_bits!(IMM8, 8);
656    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
657}
658
659/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
660/// bits.
661///
662/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
663#[inline]
664#[target_feature(enable = "sse2")]
665#[cfg_attr(test, assert_instr(psrad))]
666#[stable(feature = "simd_x86", since = "1.27.0")]
667pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
668    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
669}
670
671/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
672///
673/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
674#[inline]
675#[target_feature(enable = "sse2")]
676#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
677#[rustc_legacy_const_generics(1)]
678#[stable(feature = "simd_x86", since = "1.27.0")]
679pub fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
680    static_assert_uimm_bits!(IMM8, 8);
681    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
682}
683
684/// Implementation detail: converts the immediate argument of the
685/// `_mm_srli_si128` intrinsic into a compile-time constant.
686#[inline]
687#[target_feature(enable = "sse2")]
688unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
689    const fn mask(shift: i32, i: u32) -> u32 {
690        if (shift as u32) > 15 {
691            i + 16
692        } else {
693            i + (shift as u32)
694        }
695    }
696    let x: i8x16 = simd_shuffle!(
697        a.as_i8x16(),
698        i8x16::ZERO,
699        [
700            mask(IMM8, 0),
701            mask(IMM8, 1),
702            mask(IMM8, 2),
703            mask(IMM8, 3),
704            mask(IMM8, 4),
705            mask(IMM8, 5),
706            mask(IMM8, 6),
707            mask(IMM8, 7),
708            mask(IMM8, 8),
709            mask(IMM8, 9),
710            mask(IMM8, 10),
711            mask(IMM8, 11),
712            mask(IMM8, 12),
713            mask(IMM8, 13),
714            mask(IMM8, 14),
715            mask(IMM8, 15),
716        ],
717    );
718    transmute(x)
719}
720
721/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
722/// zeros.
723///
724/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
725#[inline]
726#[target_feature(enable = "sse2")]
727#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
728#[rustc_legacy_const_generics(1)]
729#[stable(feature = "simd_x86", since = "1.27.0")]
730pub fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
731    static_assert_uimm_bits!(IMM8, 8);
732    unsafe {
733        if IMM8 >= 16 {
734            _mm_setzero_si128()
735        } else {
736            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
737        }
738    }
739}
740
741/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
742/// zeros.
743///
744/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
745#[inline]
746#[target_feature(enable = "sse2")]
747#[cfg_attr(test, assert_instr(psrlw))]
748#[stable(feature = "simd_x86", since = "1.27.0")]
749pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
750    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
751}
752
753/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
754/// zeros.
755///
756/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
757#[inline]
758#[target_feature(enable = "sse2")]
759#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
760#[rustc_legacy_const_generics(1)]
761#[stable(feature = "simd_x86", since = "1.27.0")]
762pub fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
763    static_assert_uimm_bits!(IMM8, 8);
764    unsafe {
765        if IMM8 >= 32 {
766            _mm_setzero_si128()
767        } else {
768            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
769        }
770    }
771}
772
773/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
774/// zeros.
775///
776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
777#[inline]
778#[target_feature(enable = "sse2")]
779#[cfg_attr(test, assert_instr(psrld))]
780#[stable(feature = "simd_x86", since = "1.27.0")]
781pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
782    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
783}
784
785/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
786/// zeros.
787///
788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
789#[inline]
790#[target_feature(enable = "sse2")]
791#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
792#[rustc_legacy_const_generics(1)]
793#[stable(feature = "simd_x86", since = "1.27.0")]
794pub fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
795    static_assert_uimm_bits!(IMM8, 8);
796    unsafe {
797        if IMM8 >= 64 {
798            _mm_setzero_si128()
799        } else {
800            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
801        }
802    }
803}
804
805/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
806/// zeros.
807///
808/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
809#[inline]
810#[target_feature(enable = "sse2")]
811#[cfg_attr(test, assert_instr(psrlq))]
812#[stable(feature = "simd_x86", since = "1.27.0")]
813pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
814    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
815}
816
817/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
818/// `b`.
819///
820/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
821#[inline]
822#[target_feature(enable = "sse2")]
823#[cfg_attr(test, assert_instr(andps))]
824#[stable(feature = "simd_x86", since = "1.27.0")]
825pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
826    unsafe { simd_and(a, b) }
827}
828
829/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
830/// then AND with `b`.
831///
832/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
833#[inline]
834#[target_feature(enable = "sse2")]
835#[cfg_attr(test, assert_instr(andnps))]
836#[stable(feature = "simd_x86", since = "1.27.0")]
837pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
838    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
839}
840
841/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
842/// `b`.
843///
844/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
845#[inline]
846#[target_feature(enable = "sse2")]
847#[cfg_attr(test, assert_instr(orps))]
848#[stable(feature = "simd_x86", since = "1.27.0")]
849pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
850    unsafe { simd_or(a, b) }
851}
852
853/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
854/// `b`.
855///
856/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
857#[inline]
858#[target_feature(enable = "sse2")]
859#[cfg_attr(test, assert_instr(xorps))]
860#[stable(feature = "simd_x86", since = "1.27.0")]
861pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
862    unsafe { simd_xor(a, b) }
863}
864
865/// Compares packed 8-bit integers in `a` and `b` for equality.
866///
867/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
868#[inline]
869#[target_feature(enable = "sse2")]
870#[cfg_attr(test, assert_instr(pcmpeqb))]
871#[stable(feature = "simd_x86", since = "1.27.0")]
872pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
873    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
874}
875
876/// Compares packed 16-bit integers in `a` and `b` for equality.
877///
878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
879#[inline]
880#[target_feature(enable = "sse2")]
881#[cfg_attr(test, assert_instr(pcmpeqw))]
882#[stable(feature = "simd_x86", since = "1.27.0")]
883pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
884    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
885}
886
887/// Compares packed 32-bit integers in `a` and `b` for equality.
888///
889/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
890#[inline]
891#[target_feature(enable = "sse2")]
892#[cfg_attr(test, assert_instr(pcmpeqd))]
893#[stable(feature = "simd_x86", since = "1.27.0")]
894pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
895    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
896}
897
898/// Compares packed 8-bit integers in `a` and `b` for greater-than.
899///
900/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
901#[inline]
902#[target_feature(enable = "sse2")]
903#[cfg_attr(test, assert_instr(pcmpgtb))]
904#[stable(feature = "simd_x86", since = "1.27.0")]
905pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
906    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
907}
908
909/// Compares packed 16-bit integers in `a` and `b` for greater-than.
910///
911/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
912#[inline]
913#[target_feature(enable = "sse2")]
914#[cfg_attr(test, assert_instr(pcmpgtw))]
915#[stable(feature = "simd_x86", since = "1.27.0")]
916pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
917    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
918}
919
920/// Compares packed 32-bit integers in `a` and `b` for greater-than.
921///
922/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
923#[inline]
924#[target_feature(enable = "sse2")]
925#[cfg_attr(test, assert_instr(pcmpgtd))]
926#[stable(feature = "simd_x86", since = "1.27.0")]
927pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
928    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
929}
930
931/// Compares packed 8-bit integers in `a` and `b` for less-than.
932///
933/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
934#[inline]
935#[target_feature(enable = "sse2")]
936#[cfg_attr(test, assert_instr(pcmpgtb))]
937#[stable(feature = "simd_x86", since = "1.27.0")]
938pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
939    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
940}
941
942/// Compares packed 16-bit integers in `a` and `b` for less-than.
943///
944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
945#[inline]
946#[target_feature(enable = "sse2")]
947#[cfg_attr(test, assert_instr(pcmpgtw))]
948#[stable(feature = "simd_x86", since = "1.27.0")]
949pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
950    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
951}
952
953/// Compares packed 32-bit integers in `a` and `b` for less-than.
954///
955/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
956#[inline]
957#[target_feature(enable = "sse2")]
958#[cfg_attr(test, assert_instr(pcmpgtd))]
959#[stable(feature = "simd_x86", since = "1.27.0")]
960pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
961    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
962}
963
964/// Converts the lower two packed 32-bit integers in `a` to packed
965/// double-precision (64-bit) floating-point elements.
966///
967/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
968#[inline]
969#[target_feature(enable = "sse2")]
970#[cfg_attr(test, assert_instr(cvtdq2pd))]
971#[stable(feature = "simd_x86", since = "1.27.0")]
972pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
973    unsafe {
974        let a = a.as_i32x4();
975        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
976    }
977}
978
979/// Returns `a` with its lower element replaced by `b` after converting it to
980/// an `f64`.
981///
982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
983#[inline]
984#[target_feature(enable = "sse2")]
985#[cfg_attr(test, assert_instr(cvtsi2sd))]
986#[stable(feature = "simd_x86", since = "1.27.0")]
987pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
988    unsafe { simd_insert!(a, 0, b as f64) }
989}
990
991/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
992/// floating-point elements.
993///
994/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
995#[inline]
996#[target_feature(enable = "sse2")]
997#[cfg_attr(test, assert_instr(cvtdq2ps))]
998#[stable(feature = "simd_x86", since = "1.27.0")]
999pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
1000    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
1001}
1002
1003/// Converts packed single-precision (32-bit) floating-point elements in `a`
1004/// to packed 32-bit integers.
1005///
1006/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
1007#[inline]
1008#[target_feature(enable = "sse2")]
1009#[cfg_attr(test, assert_instr(cvtps2dq))]
1010#[stable(feature = "simd_x86", since = "1.27.0")]
1011pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
1012    unsafe { transmute(cvtps2dq(a)) }
1013}
1014
1015/// Returns a vector whose lowest element is `a` and all higher elements are
1016/// `0`.
1017///
1018/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
1019#[inline]
1020#[target_feature(enable = "sse2")]
1021#[stable(feature = "simd_x86", since = "1.27.0")]
1022pub fn _mm_cvtsi32_si128(a: i32) -> __m128i {
1023    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
1024}
1025
1026/// Returns the lowest element of `a`.
1027///
1028/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
1029#[inline]
1030#[target_feature(enable = "sse2")]
1031#[stable(feature = "simd_x86", since = "1.27.0")]
1032pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
1033    unsafe { simd_extract!(a.as_i32x4(), 0) }
1034}
1035
1036/// Sets packed 64-bit integers with the supplied values, from highest to
1037/// lowest.
1038///
1039/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
1040#[inline]
1041#[target_feature(enable = "sse2")]
1042// no particular instruction to test
1043#[stable(feature = "simd_x86", since = "1.27.0")]
1044pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
1045    unsafe { transmute(i64x2::new(e0, e1)) }
1046}
1047
1048/// Sets packed 32-bit integers with the supplied values.
1049///
1050/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
1051#[inline]
1052#[target_feature(enable = "sse2")]
1053// no particular instruction to test
1054#[stable(feature = "simd_x86", since = "1.27.0")]
1055pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
1056    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
1057}
1058
1059/// Sets packed 16-bit integers with the supplied values.
1060///
1061/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
1062#[inline]
1063#[target_feature(enable = "sse2")]
1064// no particular instruction to test
1065#[stable(feature = "simd_x86", since = "1.27.0")]
1066pub fn _mm_set_epi16(
1067    e7: i16,
1068    e6: i16,
1069    e5: i16,
1070    e4: i16,
1071    e3: i16,
1072    e2: i16,
1073    e1: i16,
1074    e0: i16,
1075) -> __m128i {
1076    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
1077}
1078
1079/// Sets packed 8-bit integers with the supplied values.
1080///
1081/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
1082#[inline]
1083#[target_feature(enable = "sse2")]
1084// no particular instruction to test
1085#[stable(feature = "simd_x86", since = "1.27.0")]
1086pub fn _mm_set_epi8(
1087    e15: i8,
1088    e14: i8,
1089    e13: i8,
1090    e12: i8,
1091    e11: i8,
1092    e10: i8,
1093    e9: i8,
1094    e8: i8,
1095    e7: i8,
1096    e6: i8,
1097    e5: i8,
1098    e4: i8,
1099    e3: i8,
1100    e2: i8,
1101    e1: i8,
1102    e0: i8,
1103) -> __m128i {
1104    unsafe {
1105        #[rustfmt::skip]
1106        transmute(i8x16::new(
1107            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
1108        ))
1109    }
1110}
1111
1112/// Broadcasts 64-bit integer `a` to all elements.
1113///
1114/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
1115#[inline]
1116#[target_feature(enable = "sse2")]
1117// no particular instruction to test
1118#[stable(feature = "simd_x86", since = "1.27.0")]
1119pub fn _mm_set1_epi64x(a: i64) -> __m128i {
1120    _mm_set_epi64x(a, a)
1121}
1122
1123/// Broadcasts 32-bit integer `a` to all elements.
1124///
1125/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
1126#[inline]
1127#[target_feature(enable = "sse2")]
1128// no particular instruction to test
1129#[stable(feature = "simd_x86", since = "1.27.0")]
1130pub fn _mm_set1_epi32(a: i32) -> __m128i {
1131    _mm_set_epi32(a, a, a, a)
1132}
1133
1134/// Broadcasts 16-bit integer `a` to all elements.
1135///
1136/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
1137#[inline]
1138#[target_feature(enable = "sse2")]
1139// no particular instruction to test
1140#[stable(feature = "simd_x86", since = "1.27.0")]
1141pub fn _mm_set1_epi16(a: i16) -> __m128i {
1142    _mm_set_epi16(a, a, a, a, a, a, a, a)
1143}
1144
1145/// Broadcasts 8-bit integer `a` to all elements.
1146///
1147/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
1148#[inline]
1149#[target_feature(enable = "sse2")]
1150// no particular instruction to test
1151#[stable(feature = "simd_x86", since = "1.27.0")]
1152pub fn _mm_set1_epi8(a: i8) -> __m128i {
1153    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
1154}
1155
1156/// Sets packed 32-bit integers with the supplied values in reverse order.
1157///
1158/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
1159#[inline]
1160#[target_feature(enable = "sse2")]
1161// no particular instruction to test
1162#[stable(feature = "simd_x86", since = "1.27.0")]
1163pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
1164    _mm_set_epi32(e0, e1, e2, e3)
1165}
1166
1167/// Sets packed 16-bit integers with the supplied values in reverse order.
1168///
1169/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
1170#[inline]
1171#[target_feature(enable = "sse2")]
1172// no particular instruction to test
1173#[stable(feature = "simd_x86", since = "1.27.0")]
1174pub fn _mm_setr_epi16(
1175    e7: i16,
1176    e6: i16,
1177    e5: i16,
1178    e4: i16,
1179    e3: i16,
1180    e2: i16,
1181    e1: i16,
1182    e0: i16,
1183) -> __m128i {
1184    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
1185}
1186
1187/// Sets packed 8-bit integers with the supplied values in reverse order.
1188///
1189/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
1190#[inline]
1191#[target_feature(enable = "sse2")]
1192// no particular instruction to test
1193#[stable(feature = "simd_x86", since = "1.27.0")]
1194pub fn _mm_setr_epi8(
1195    e15: i8,
1196    e14: i8,
1197    e13: i8,
1198    e12: i8,
1199    e11: i8,
1200    e10: i8,
1201    e9: i8,
1202    e8: i8,
1203    e7: i8,
1204    e6: i8,
1205    e5: i8,
1206    e4: i8,
1207    e3: i8,
1208    e2: i8,
1209    e1: i8,
1210    e0: i8,
1211) -> __m128i {
1212    #[rustfmt::skip]
1213    _mm_set_epi8(
1214        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
1215    )
1216}
1217
1218/// Returns a vector with all elements set to zero.
1219///
1220/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
1221#[inline]
1222#[target_feature(enable = "sse2")]
1223#[cfg_attr(test, assert_instr(xorps))]
1224#[stable(feature = "simd_x86", since = "1.27.0")]
1225pub fn _mm_setzero_si128() -> __m128i {
1226    const { unsafe { mem::zeroed() } }
1227}
1228
1229/// Loads 64-bit integer from memory into first element of returned vector.
1230///
1231/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
1232#[inline]
1233#[target_feature(enable = "sse2")]
1234#[stable(feature = "simd_x86", since = "1.27.0")]
1235pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
1236    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
1237}
1238
1239/// Loads 128-bits of integer data from memory into a new vector.
1240///
1241/// `mem_addr` must be aligned on a 16-byte boundary.
1242///
1243/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
1244#[inline]
1245#[target_feature(enable = "sse2")]
1246#[cfg_attr(
1247    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1248    assert_instr(movaps)
1249)]
1250#[stable(feature = "simd_x86", since = "1.27.0")]
1251pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
1252    *mem_addr
1253}
1254
1255/// Loads 128-bits of integer data from memory into a new vector.
1256///
1257/// `mem_addr` does not need to be aligned on any particular boundary.
1258///
1259/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
1260#[inline]
1261#[target_feature(enable = "sse2")]
1262#[cfg_attr(test, assert_instr(movups))]
1263#[stable(feature = "simd_x86", since = "1.27.0")]
1264pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
1265    let mut dst: __m128i = _mm_undefined_si128();
1266    ptr::copy_nonoverlapping(
1267        mem_addr as *const u8,
1268        ptr::addr_of_mut!(dst) as *mut u8,
1269        mem::size_of::<__m128i>(),
1270    );
1271    dst
1272}
1273
1274/// Conditionally store 8-bit integer elements from `a` into memory using
1275/// `mask`.
1276///
1277/// Elements are not stored when the highest bit is not set in the
1278/// corresponding element.
1279///
1280/// `mem_addr` should correspond to a 128-bit memory location and does not need
1281/// to be aligned on any particular boundary.
1282///
1283/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
1284#[inline]
1285#[target_feature(enable = "sse2")]
1286#[cfg_attr(test, assert_instr(maskmovdqu))]
1287#[stable(feature = "simd_x86", since = "1.27.0")]
1288pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
1289    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
1290}
1291
1292/// Stores 128-bits of integer data from `a` into memory.
1293///
1294/// `mem_addr` must be aligned on a 16-byte boundary.
1295///
1296/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
1297#[inline]
1298#[target_feature(enable = "sse2")]
1299#[cfg_attr(
1300    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1301    assert_instr(movaps)
1302)]
1303#[stable(feature = "simd_x86", since = "1.27.0")]
1304pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
1305    *mem_addr = a;
1306}
1307
1308/// Stores 128-bits of integer data from `a` into memory.
1309///
1310/// `mem_addr` does not need to be aligned on any particular boundary.
1311///
1312/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
1313#[inline]
1314#[target_feature(enable = "sse2")]
1315#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
1316#[stable(feature = "simd_x86", since = "1.27.0")]
1317pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
1318    mem_addr.write_unaligned(a);
1319}
1320
1321/// Stores the lower 64-bit integer `a` to a memory location.
1322///
1323/// `mem_addr` does not need to be aligned on any particular boundary.
1324///
1325/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
1326#[inline]
1327#[target_feature(enable = "sse2")]
1328#[stable(feature = "simd_x86", since = "1.27.0")]
1329pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
1330    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
1331}
1332
1333/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
1334/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1335/// used again soon).
1336///
1337/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
1338///
1339/// # Safety of non-temporal stores
1340///
1341/// After using this intrinsic, but before any other access to the memory that this intrinsic
1342/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1343/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1344/// return.
1345///
1346/// See [`_mm_sfence`] for details.
1347#[inline]
1348#[target_feature(enable = "sse2")]
1349#[cfg_attr(test, assert_instr(movntdq))]
1350#[stable(feature = "simd_x86", since = "1.27.0")]
1351pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
1352    crate::arch::asm!(
1353        vps!("movntdq",  ",{a}"),
1354        p = in(reg) mem_addr,
1355        a = in(xmm_reg) a,
1356        options(nostack, preserves_flags),
1357    );
1358}
1359
1360/// Stores a 32-bit integer value in the specified memory location.
1361/// To minimize caching, the data is flagged as non-temporal (unlikely to be
1362/// used again soon).
1363///
1364/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
1365///
1366/// # Safety of non-temporal stores
1367///
1368/// After using this intrinsic, but before any other access to the memory that this intrinsic
1369/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1370/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1371/// return.
1372///
1373/// See [`_mm_sfence`] for details.
1374#[inline]
1375#[target_feature(enable = "sse2")]
1376#[cfg_attr(test, assert_instr(movnti))]
1377#[stable(feature = "simd_x86", since = "1.27.0")]
1378pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
1379    crate::arch::asm!(
1380        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
1381        p = in(reg) mem_addr,
1382        a = in(reg) a,
1383        options(nostack, preserves_flags),
1384    );
1385}
1386
1387/// Returns a vector where the low element is extracted from `a` and its upper
1388/// element is zero.
1389///
1390/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
1391#[inline]
1392#[target_feature(enable = "sse2")]
1393// FIXME movd on msvc, movd on i686
1394#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movq))]
1395#[stable(feature = "simd_x86", since = "1.27.0")]
1396pub fn _mm_move_epi64(a: __m128i) -> __m128i {
1397    unsafe {
1398        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
1399        transmute(r)
1400    }
1401}
1402
1403/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1404/// using signed saturation.
1405///
1406/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
1407#[inline]
1408#[target_feature(enable = "sse2")]
1409#[cfg_attr(test, assert_instr(packsswb))]
1410#[stable(feature = "simd_x86", since = "1.27.0")]
1411pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
1412    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
1413}
1414
1415/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
1416/// using signed saturation.
1417///
1418/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
1419#[inline]
1420#[target_feature(enable = "sse2")]
1421#[cfg_attr(test, assert_instr(packssdw))]
1422#[stable(feature = "simd_x86", since = "1.27.0")]
1423pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
1424    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
1425}
1426
1427/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1428/// using unsigned saturation.
1429///
1430/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
1431#[inline]
1432#[target_feature(enable = "sse2")]
1433#[cfg_attr(test, assert_instr(packuswb))]
1434#[stable(feature = "simd_x86", since = "1.27.0")]
1435pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
1436    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
1437}
1438
1439/// Returns the `imm8` element of `a`.
1440///
1441/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
1442#[inline]
1443#[target_feature(enable = "sse2")]
1444#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
1445#[rustc_legacy_const_generics(1)]
1446#[stable(feature = "simd_x86", since = "1.27.0")]
1447pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
1448    static_assert_uimm_bits!(IMM8, 3);
1449    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
1450}
1451
1452/// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
1453///
1454/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
1455#[inline]
1456#[target_feature(enable = "sse2")]
1457#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
1458#[rustc_legacy_const_generics(2)]
1459#[stable(feature = "simd_x86", since = "1.27.0")]
1460pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
1461    static_assert_uimm_bits!(IMM8, 3);
1462    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
1463}
1464
1465/// Returns a mask of the most significant bit of each element in `a`.
1466///
1467/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
1468#[inline]
1469#[target_feature(enable = "sse2")]
1470#[cfg_attr(test, assert_instr(pmovmskb))]
1471#[stable(feature = "simd_x86", since = "1.27.0")]
1472pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
1473    unsafe {
1474        let z = i8x16::ZERO;
1475        let m: i8x16 = simd_lt(a.as_i8x16(), z);
1476        simd_bitmask::<_, u16>(m) as u32 as i32
1477    }
1478}
1479
1480/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
1481///
1482/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
1483#[inline]
1484#[target_feature(enable = "sse2")]
1485#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
1486#[rustc_legacy_const_generics(1)]
1487#[stable(feature = "simd_x86", since = "1.27.0")]
1488pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
1489    static_assert_uimm_bits!(IMM8, 8);
1490    unsafe {
1491        let a = a.as_i32x4();
1492        let x: i32x4 = simd_shuffle!(
1493            a,
1494            a,
1495            [
1496                IMM8 as u32 & 0b11,
1497                (IMM8 as u32 >> 2) & 0b11,
1498                (IMM8 as u32 >> 4) & 0b11,
1499                (IMM8 as u32 >> 6) & 0b11,
1500            ],
1501        );
1502        transmute(x)
1503    }
1504}
1505
1506/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
1507/// `IMM8`.
1508///
1509/// Put the results in the high 64 bits of the returned vector, with the low 64
1510/// bits being copied from `a`.
1511///
1512/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
1513#[inline]
1514#[target_feature(enable = "sse2")]
1515#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
1516#[rustc_legacy_const_generics(1)]
1517#[stable(feature = "simd_x86", since = "1.27.0")]
1518pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1519    static_assert_uimm_bits!(IMM8, 8);
1520    unsafe {
1521        let a = a.as_i16x8();
1522        let x: i16x8 = simd_shuffle!(
1523            a,
1524            a,
1525            [
1526                0,
1527                1,
1528                2,
1529                3,
1530                (IMM8 as u32 & 0b11) + 4,
1531                ((IMM8 as u32 >> 2) & 0b11) + 4,
1532                ((IMM8 as u32 >> 4) & 0b11) + 4,
1533                ((IMM8 as u32 >> 6) & 0b11) + 4,
1534            ],
1535        );
1536        transmute(x)
1537    }
1538}
1539
1540/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
1541/// `IMM8`.
1542///
1543/// Put the results in the low 64 bits of the returned vector, with the high 64
1544/// bits being copied from `a`.
1545///
1546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
1547#[inline]
1548#[target_feature(enable = "sse2")]
1549#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
1550#[rustc_legacy_const_generics(1)]
1551#[stable(feature = "simd_x86", since = "1.27.0")]
1552pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1553    static_assert_uimm_bits!(IMM8, 8);
1554    unsafe {
1555        let a = a.as_i16x8();
1556        let x: i16x8 = simd_shuffle!(
1557            a,
1558            a,
1559            [
1560                IMM8 as u32 & 0b11,
1561                (IMM8 as u32 >> 2) & 0b11,
1562                (IMM8 as u32 >> 4) & 0b11,
1563                (IMM8 as u32 >> 6) & 0b11,
1564                4,
1565                5,
1566                6,
1567                7,
1568            ],
1569        );
1570        transmute(x)
1571    }
1572}
1573
1574/// Unpacks and interleave 8-bit integers from the high half of `a` and `b`.
1575///
1576/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
1577#[inline]
1578#[target_feature(enable = "sse2")]
1579#[cfg_attr(test, assert_instr(punpckhbw))]
1580#[stable(feature = "simd_x86", since = "1.27.0")]
1581pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
1582    unsafe {
1583        transmute::<i8x16, _>(simd_shuffle!(
1584            a.as_i8x16(),
1585            b.as_i8x16(),
1586            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
1587        ))
1588    }
1589}
1590
1591/// Unpacks and interleave 16-bit integers from the high half of `a` and `b`.
1592///
1593/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
1594#[inline]
1595#[target_feature(enable = "sse2")]
1596#[cfg_attr(test, assert_instr(punpckhwd))]
1597#[stable(feature = "simd_x86", since = "1.27.0")]
1598pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
1599    unsafe {
1600        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
1601        transmute::<i16x8, _>(x)
1602    }
1603}
1604
1605/// Unpacks and interleave 32-bit integers from the high half of `a` and `b`.
1606///
1607/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
1608#[inline]
1609#[target_feature(enable = "sse2")]
1610#[cfg_attr(test, assert_instr(unpckhps))]
1611#[stable(feature = "simd_x86", since = "1.27.0")]
1612pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
1613    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
1614}
1615
1616/// Unpacks and interleave 64-bit integers from the high half of `a` and `b`.
1617///
1618/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
1619#[inline]
1620#[target_feature(enable = "sse2")]
1621#[cfg_attr(test, assert_instr(unpckhpd))]
1622#[stable(feature = "simd_x86", since = "1.27.0")]
1623pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
1624    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
1625}
1626
1627/// Unpacks and interleave 8-bit integers from the low half of `a` and `b`.
1628///
1629/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
1630#[inline]
1631#[target_feature(enable = "sse2")]
1632#[cfg_attr(test, assert_instr(punpcklbw))]
1633#[stable(feature = "simd_x86", since = "1.27.0")]
1634pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
1635    unsafe {
1636        transmute::<i8x16, _>(simd_shuffle!(
1637            a.as_i8x16(),
1638            b.as_i8x16(),
1639            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
1640        ))
1641    }
1642}
1643
1644/// Unpacks and interleave 16-bit integers from the low half of `a` and `b`.
1645///
1646/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
1647#[inline]
1648#[target_feature(enable = "sse2")]
1649#[cfg_attr(test, assert_instr(punpcklwd))]
1650#[stable(feature = "simd_x86", since = "1.27.0")]
1651pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
1652    unsafe {
1653        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
1654        transmute::<i16x8, _>(x)
1655    }
1656}
1657
1658/// Unpacks and interleave 32-bit integers from the low half of `a` and `b`.
1659///
1660/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
1661#[inline]
1662#[target_feature(enable = "sse2")]
1663#[cfg_attr(test, assert_instr(unpcklps))]
1664#[stable(feature = "simd_x86", since = "1.27.0")]
1665pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
1666    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
1667}
1668
1669/// Unpacks and interleave 64-bit integers from the low half of `a` and `b`.
1670///
1671/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
1672#[inline]
1673#[target_feature(enable = "sse2")]
1674#[cfg_attr(test, assert_instr(movlhps))]
1675#[stable(feature = "simd_x86", since = "1.27.0")]
1676pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
1677    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
1678}
1679
1680/// Returns a new vector with the low element of `a` replaced by the sum of the
1681/// low elements of `a` and `b`.
1682///
1683/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
1684#[inline]
1685#[target_feature(enable = "sse2")]
1686#[cfg_attr(test, assert_instr(addsd))]
1687#[stable(feature = "simd_x86", since = "1.27.0")]
1688pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
1689    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
1690}
1691
1692/// Adds packed double-precision (64-bit) floating-point elements in `a` and
1693/// `b`.
1694///
1695/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
1696#[inline]
1697#[target_feature(enable = "sse2")]
1698#[cfg_attr(test, assert_instr(addpd))]
1699#[stable(feature = "simd_x86", since = "1.27.0")]
1700pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
1701    unsafe { simd_add(a, b) }
1702}
1703
1704/// Returns a new vector with the low element of `a` replaced by the result of
1705/// diving the lower element of `a` by the lower element of `b`.
1706///
1707/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
1708#[inline]
1709#[target_feature(enable = "sse2")]
1710#[cfg_attr(test, assert_instr(divsd))]
1711#[stable(feature = "simd_x86", since = "1.27.0")]
1712pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
1713    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
1714}
1715
1716/// Divide packed double-precision (64-bit) floating-point elements in `a` by
1717/// packed elements in `b`.
1718///
1719/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
1720#[inline]
1721#[target_feature(enable = "sse2")]
1722#[cfg_attr(test, assert_instr(divpd))]
1723#[stable(feature = "simd_x86", since = "1.27.0")]
1724pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
1725    unsafe { simd_div(a, b) }
1726}
1727
1728/// Returns a new vector with the low element of `a` replaced by the maximum
1729/// of the lower elements of `a` and `b`.
1730///
1731/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
1732#[inline]
1733#[target_feature(enable = "sse2")]
1734#[cfg_attr(test, assert_instr(maxsd))]
1735#[stable(feature = "simd_x86", since = "1.27.0")]
1736pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
1737    unsafe { maxsd(a, b) }
1738}
1739
1740/// Returns a new vector with the maximum values from corresponding elements in
1741/// `a` and `b`.
1742///
1743/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
1744#[inline]
1745#[target_feature(enable = "sse2")]
1746#[cfg_attr(test, assert_instr(maxpd))]
1747#[stable(feature = "simd_x86", since = "1.27.0")]
1748pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
1749    unsafe { maxpd(a, b) }
1750}
1751
1752/// Returns a new vector with the low element of `a` replaced by the minimum
1753/// of the lower elements of `a` and `b`.
1754///
1755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
1756#[inline]
1757#[target_feature(enable = "sse2")]
1758#[cfg_attr(test, assert_instr(minsd))]
1759#[stable(feature = "simd_x86", since = "1.27.0")]
1760pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
1761    unsafe { minsd(a, b) }
1762}
1763
1764/// Returns a new vector with the minimum values from corresponding elements in
1765/// `a` and `b`.
1766///
1767/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
1768#[inline]
1769#[target_feature(enable = "sse2")]
1770#[cfg_attr(test, assert_instr(minpd))]
1771#[stable(feature = "simd_x86", since = "1.27.0")]
1772pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
1773    unsafe { minpd(a, b) }
1774}
1775
1776/// Returns a new vector with the low element of `a` replaced by multiplying the
1777/// low elements of `a` and `b`.
1778///
1779/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
1780#[inline]
1781#[target_feature(enable = "sse2")]
1782#[cfg_attr(test, assert_instr(mulsd))]
1783#[stable(feature = "simd_x86", since = "1.27.0")]
1784pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
1785    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
1786}
1787
1788/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
1789/// and `b`.
1790///
1791/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
1792#[inline]
1793#[target_feature(enable = "sse2")]
1794#[cfg_attr(test, assert_instr(mulpd))]
1795#[stable(feature = "simd_x86", since = "1.27.0")]
1796pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
1797    unsafe { simd_mul(a, b) }
1798}
1799
1800/// Returns a new vector with the low element of `a` replaced by the square
1801/// root of the lower element `b`.
1802///
1803/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
1804#[inline]
1805#[target_feature(enable = "sse2")]
1806#[cfg_attr(test, assert_instr(sqrtsd))]
1807#[stable(feature = "simd_x86", since = "1.27.0")]
1808pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
1809    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
1810}
1811
1812/// Returns a new vector with the square root of each of the values in `a`.
1813///
1814/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
1815#[inline]
1816#[target_feature(enable = "sse2")]
1817#[cfg_attr(test, assert_instr(sqrtpd))]
1818#[stable(feature = "simd_x86", since = "1.27.0")]
1819pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
1820    unsafe { simd_fsqrt(a) }
1821}
1822
1823/// Returns a new vector with the low element of `a` replaced by subtracting the
1824/// low element by `b` from the low element of `a`.
1825///
1826/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
1827#[inline]
1828#[target_feature(enable = "sse2")]
1829#[cfg_attr(test, assert_instr(subsd))]
1830#[stable(feature = "simd_x86", since = "1.27.0")]
1831pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
1832    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
1833}
1834
1835/// Subtract packed double-precision (64-bit) floating-point elements in `b`
1836/// from `a`.
1837///
1838/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
1839#[inline]
1840#[target_feature(enable = "sse2")]
1841#[cfg_attr(test, assert_instr(subpd))]
1842#[stable(feature = "simd_x86", since = "1.27.0")]
1843pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
1844    unsafe { simd_sub(a, b) }
1845}
1846
1847/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
1848/// elements in `a` and `b`.
1849///
1850/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
1851#[inline]
1852#[target_feature(enable = "sse2")]
1853#[cfg_attr(test, assert_instr(andps))]
1854#[stable(feature = "simd_x86", since = "1.27.0")]
1855pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
1856    unsafe {
1857        let a: __m128i = transmute(a);
1858        let b: __m128i = transmute(b);
1859        transmute(_mm_and_si128(a, b))
1860    }
1861}
1862
1863/// Computes the bitwise NOT of `a` and then AND with `b`.
1864///
1865/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
1866#[inline]
1867#[target_feature(enable = "sse2")]
1868#[cfg_attr(test, assert_instr(andnps))]
1869#[stable(feature = "simd_x86", since = "1.27.0")]
1870pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
1871    unsafe {
1872        let a: __m128i = transmute(a);
1873        let b: __m128i = transmute(b);
1874        transmute(_mm_andnot_si128(a, b))
1875    }
1876}
1877
1878/// Computes the bitwise OR of `a` and `b`.
1879///
1880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
1881#[inline]
1882#[target_feature(enable = "sse2")]
1883#[cfg_attr(test, assert_instr(orps))]
1884#[stable(feature = "simd_x86", since = "1.27.0")]
1885pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
1886    unsafe {
1887        let a: __m128i = transmute(a);
1888        let b: __m128i = transmute(b);
1889        transmute(_mm_or_si128(a, b))
1890    }
1891}
1892
1893/// Computes the bitwise XOR of `a` and `b`.
1894///
1895/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
1896#[inline]
1897#[target_feature(enable = "sse2")]
1898#[cfg_attr(test, assert_instr(xorps))]
1899#[stable(feature = "simd_x86", since = "1.27.0")]
1900pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
1901    unsafe {
1902        let a: __m128i = transmute(a);
1903        let b: __m128i = transmute(b);
1904        transmute(_mm_xor_si128(a, b))
1905    }
1906}
1907
1908/// Returns a new vector with the low element of `a` replaced by the equality
1909/// comparison of the lower elements of `a` and `b`.
1910///
1911/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
1912#[inline]
1913#[target_feature(enable = "sse2")]
1914#[cfg_attr(test, assert_instr(cmpeqsd))]
1915#[stable(feature = "simd_x86", since = "1.27.0")]
1916pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
1917    unsafe { cmpsd(a, b, 0) }
1918}
1919
1920/// Returns a new vector with the low element of `a` replaced by the less-than
1921/// comparison of the lower elements of `a` and `b`.
1922///
1923/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
1924#[inline]
1925#[target_feature(enable = "sse2")]
1926#[cfg_attr(test, assert_instr(cmpltsd))]
1927#[stable(feature = "simd_x86", since = "1.27.0")]
1928pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
1929    unsafe { cmpsd(a, b, 1) }
1930}
1931
1932/// Returns a new vector with the low element of `a` replaced by the
1933/// less-than-or-equal comparison of the lower elements of `a` and `b`.
1934///
1935/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
1936#[inline]
1937#[target_feature(enable = "sse2")]
1938#[cfg_attr(test, assert_instr(cmplesd))]
1939#[stable(feature = "simd_x86", since = "1.27.0")]
1940pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
1941    unsafe { cmpsd(a, b, 2) }
1942}
1943
1944/// Returns a new vector with the low element of `a` replaced by the
1945/// greater-than comparison of the lower elements of `a` and `b`.
1946///
1947/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
1948#[inline]
1949#[target_feature(enable = "sse2")]
1950#[cfg_attr(test, assert_instr(cmpltsd))]
1951#[stable(feature = "simd_x86", since = "1.27.0")]
1952pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
1953    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
1954}
1955
1956/// Returns a new vector with the low element of `a` replaced by the
1957/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
1958///
1959/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
1960#[inline]
1961#[target_feature(enable = "sse2")]
1962#[cfg_attr(test, assert_instr(cmplesd))]
1963#[stable(feature = "simd_x86", since = "1.27.0")]
1964pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
1965    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
1966}
1967
1968/// Returns a new vector with the low element of `a` replaced by the result
1969/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
1970/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0`
1971/// otherwise.
1972///
1973/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
1974#[inline]
1975#[target_feature(enable = "sse2")]
1976#[cfg_attr(test, assert_instr(cmpordsd))]
1977#[stable(feature = "simd_x86", since = "1.27.0")]
1978pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
1979    unsafe { cmpsd(a, b, 7) }
1980}
1981
1982/// Returns a new vector with the low element of `a` replaced by the result of
1983/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
1984/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise.
1985///
1986/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
1987#[inline]
1988#[target_feature(enable = "sse2")]
1989#[cfg_attr(test, assert_instr(cmpunordsd))]
1990#[stable(feature = "simd_x86", since = "1.27.0")]
1991pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
1992    unsafe { cmpsd(a, b, 3) }
1993}
1994
1995/// Returns a new vector with the low element of `a` replaced by the not-equal
1996/// comparison of the lower elements of `a` and `b`.
1997///
1998/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
1999#[inline]
2000#[target_feature(enable = "sse2")]
2001#[cfg_attr(test, assert_instr(cmpneqsd))]
2002#[stable(feature = "simd_x86", since = "1.27.0")]
2003pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
2004    unsafe { cmpsd(a, b, 4) }
2005}
2006
2007/// Returns a new vector with the low element of `a` replaced by the
2008/// not-less-than comparison of the lower elements of `a` and `b`.
2009///
2010/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
2011#[inline]
2012#[target_feature(enable = "sse2")]
2013#[cfg_attr(test, assert_instr(cmpnltsd))]
2014#[stable(feature = "simd_x86", since = "1.27.0")]
2015pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
2016    unsafe { cmpsd(a, b, 5) }
2017}
2018
2019/// Returns a new vector with the low element of `a` replaced by the
2020/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
2021///
2022/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
2023#[inline]
2024#[target_feature(enable = "sse2")]
2025#[cfg_attr(test, assert_instr(cmpnlesd))]
2026#[stable(feature = "simd_x86", since = "1.27.0")]
2027pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
2028    unsafe { cmpsd(a, b, 6) }
2029}
2030
2031/// Returns a new vector with the low element of `a` replaced by the
2032/// not-greater-than comparison of the lower elements of `a` and `b`.
2033///
2034/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
2035#[inline]
2036#[target_feature(enable = "sse2")]
2037#[cfg_attr(test, assert_instr(cmpnltsd))]
2038#[stable(feature = "simd_x86", since = "1.27.0")]
2039pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
2040    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2041}
2042
2043/// Returns a new vector with the low element of `a` replaced by the
2044/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
2045///
2046/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
2047#[inline]
2048#[target_feature(enable = "sse2")]
2049#[cfg_attr(test, assert_instr(cmpnlesd))]
2050#[stable(feature = "simd_x86", since = "1.27.0")]
2051pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
2052    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2053}
2054
2055/// Compares corresponding elements in `a` and `b` for equality.
2056///
2057/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
2058#[inline]
2059#[target_feature(enable = "sse2")]
2060#[cfg_attr(test, assert_instr(cmpeqpd))]
2061#[stable(feature = "simd_x86", since = "1.27.0")]
2062pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
2063    unsafe { cmppd(a, b, 0) }
2064}
2065
2066/// Compares corresponding elements in `a` and `b` for less-than.
2067///
2068/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
2069#[inline]
2070#[target_feature(enable = "sse2")]
2071#[cfg_attr(test, assert_instr(cmpltpd))]
2072#[stable(feature = "simd_x86", since = "1.27.0")]
2073pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
2074    unsafe { cmppd(a, b, 1) }
2075}
2076
2077/// Compares corresponding elements in `a` and `b` for less-than-or-equal
2078///
2079/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
2080#[inline]
2081#[target_feature(enable = "sse2")]
2082#[cfg_attr(test, assert_instr(cmplepd))]
2083#[stable(feature = "simd_x86", since = "1.27.0")]
2084pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
2085    unsafe { cmppd(a, b, 2) }
2086}
2087
2088/// Compares corresponding elements in `a` and `b` for greater-than.
2089///
2090/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
2091#[inline]
2092#[target_feature(enable = "sse2")]
2093#[cfg_attr(test, assert_instr(cmpltpd))]
2094#[stable(feature = "simd_x86", since = "1.27.0")]
2095pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
2096    _mm_cmplt_pd(b, a)
2097}
2098
2099/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
2100///
2101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
2102#[inline]
2103#[target_feature(enable = "sse2")]
2104#[cfg_attr(test, assert_instr(cmplepd))]
2105#[stable(feature = "simd_x86", since = "1.27.0")]
2106pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
2107    _mm_cmple_pd(b, a)
2108}
2109
2110/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
2111///
2112/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
2113#[inline]
2114#[target_feature(enable = "sse2")]
2115#[cfg_attr(test, assert_instr(cmpordpd))]
2116#[stable(feature = "simd_x86", since = "1.27.0")]
2117pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
2118    unsafe { cmppd(a, b, 7) }
2119}
2120
2121/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
2122///
2123/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
2124#[inline]
2125#[target_feature(enable = "sse2")]
2126#[cfg_attr(test, assert_instr(cmpunordpd))]
2127#[stable(feature = "simd_x86", since = "1.27.0")]
2128pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
2129    unsafe { cmppd(a, b, 3) }
2130}
2131
2132/// Compares corresponding elements in `a` and `b` for not-equal.
2133///
2134/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
2135#[inline]
2136#[target_feature(enable = "sse2")]
2137#[cfg_attr(test, assert_instr(cmpneqpd))]
2138#[stable(feature = "simd_x86", since = "1.27.0")]
2139pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
2140    unsafe { cmppd(a, b, 4) }
2141}
2142
2143/// Compares corresponding elements in `a` and `b` for not-less-than.
2144///
2145/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
2146#[inline]
2147#[target_feature(enable = "sse2")]
2148#[cfg_attr(test, assert_instr(cmpnltpd))]
2149#[stable(feature = "simd_x86", since = "1.27.0")]
2150pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
2151    unsafe { cmppd(a, b, 5) }
2152}
2153
2154/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
2155///
2156/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
2157#[inline]
2158#[target_feature(enable = "sse2")]
2159#[cfg_attr(test, assert_instr(cmpnlepd))]
2160#[stable(feature = "simd_x86", since = "1.27.0")]
2161pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
2162    unsafe { cmppd(a, b, 6) }
2163}
2164
2165/// Compares corresponding elements in `a` and `b` for not-greater-than.
2166///
2167/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
2168#[inline]
2169#[target_feature(enable = "sse2")]
2170#[cfg_attr(test, assert_instr(cmpnltpd))]
2171#[stable(feature = "simd_x86", since = "1.27.0")]
2172pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
2173    _mm_cmpnlt_pd(b, a)
2174}
2175
2176/// Compares corresponding elements in `a` and `b` for
2177/// not-greater-than-or-equal.
2178///
2179/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
2180#[inline]
2181#[target_feature(enable = "sse2")]
2182#[cfg_attr(test, assert_instr(cmpnlepd))]
2183#[stable(feature = "simd_x86", since = "1.27.0")]
2184pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
2185    _mm_cmpnle_pd(b, a)
2186}
2187
2188/// Compares the lower element of `a` and `b` for equality.
2189///
2190/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
2191#[inline]
2192#[target_feature(enable = "sse2")]
2193#[cfg_attr(test, assert_instr(comisd))]
2194#[stable(feature = "simd_x86", since = "1.27.0")]
2195pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
2196    unsafe { comieqsd(a, b) }
2197}
2198
2199/// Compares the lower element of `a` and `b` for less-than.
2200///
2201/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
2202#[inline]
2203#[target_feature(enable = "sse2")]
2204#[cfg_attr(test, assert_instr(comisd))]
2205#[stable(feature = "simd_x86", since = "1.27.0")]
2206pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
2207    unsafe { comiltsd(a, b) }
2208}
2209
2210/// Compares the lower element of `a` and `b` for less-than-or-equal.
2211///
2212/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
2213#[inline]
2214#[target_feature(enable = "sse2")]
2215#[cfg_attr(test, assert_instr(comisd))]
2216#[stable(feature = "simd_x86", since = "1.27.0")]
2217pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
2218    unsafe { comilesd(a, b) }
2219}
2220
2221/// Compares the lower element of `a` and `b` for greater-than.
2222///
2223/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
2224#[inline]
2225#[target_feature(enable = "sse2")]
2226#[cfg_attr(test, assert_instr(comisd))]
2227#[stable(feature = "simd_x86", since = "1.27.0")]
2228pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
2229    unsafe { comigtsd(a, b) }
2230}
2231
2232/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2233///
2234/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
2235#[inline]
2236#[target_feature(enable = "sse2")]
2237#[cfg_attr(test, assert_instr(comisd))]
2238#[stable(feature = "simd_x86", since = "1.27.0")]
2239pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
2240    unsafe { comigesd(a, b) }
2241}
2242
2243/// Compares the lower element of `a` and `b` for not-equal.
2244///
2245/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
2246#[inline]
2247#[target_feature(enable = "sse2")]
2248#[cfg_attr(test, assert_instr(comisd))]
2249#[stable(feature = "simd_x86", since = "1.27.0")]
2250pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
2251    unsafe { comineqsd(a, b) }
2252}
2253
2254/// Compares the lower element of `a` and `b` for equality.
2255///
2256/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
2257#[inline]
2258#[target_feature(enable = "sse2")]
2259#[cfg_attr(test, assert_instr(ucomisd))]
2260#[stable(feature = "simd_x86", since = "1.27.0")]
2261pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
2262    unsafe { ucomieqsd(a, b) }
2263}
2264
2265/// Compares the lower element of `a` and `b` for less-than.
2266///
2267/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
2268#[inline]
2269#[target_feature(enable = "sse2")]
2270#[cfg_attr(test, assert_instr(ucomisd))]
2271#[stable(feature = "simd_x86", since = "1.27.0")]
2272pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
2273    unsafe { ucomiltsd(a, b) }
2274}
2275
2276/// Compares the lower element of `a` and `b` for less-than-or-equal.
2277///
2278/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
2279#[inline]
2280#[target_feature(enable = "sse2")]
2281#[cfg_attr(test, assert_instr(ucomisd))]
2282#[stable(feature = "simd_x86", since = "1.27.0")]
2283pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
2284    unsafe { ucomilesd(a, b) }
2285}
2286
2287/// Compares the lower element of `a` and `b` for greater-than.
2288///
2289/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
2290#[inline]
2291#[target_feature(enable = "sse2")]
2292#[cfg_attr(test, assert_instr(ucomisd))]
2293#[stable(feature = "simd_x86", since = "1.27.0")]
2294pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
2295    unsafe { ucomigtsd(a, b) }
2296}
2297
2298/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2299///
2300/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
2301#[inline]
2302#[target_feature(enable = "sse2")]
2303#[cfg_attr(test, assert_instr(ucomisd))]
2304#[stable(feature = "simd_x86", since = "1.27.0")]
2305pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
2306    unsafe { ucomigesd(a, b) }
2307}
2308
2309/// Compares the lower element of `a` and `b` for not-equal.
2310///
2311/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
2312#[inline]
2313#[target_feature(enable = "sse2")]
2314#[cfg_attr(test, assert_instr(ucomisd))]
2315#[stable(feature = "simd_x86", since = "1.27.0")]
2316pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
2317    unsafe { ucomineqsd(a, b) }
2318}
2319
2320/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2321/// packed single-precision (32-bit) floating-point elements
2322///
2323/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
2324#[inline]
2325#[target_feature(enable = "sse2")]
2326#[cfg_attr(test, assert_instr(cvtpd2ps))]
2327#[stable(feature = "simd_x86", since = "1.27.0")]
2328pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
2329    unsafe {
2330        let r = simd_cast::<_, f32x2>(a.as_f64x2());
2331        let zero = f32x2::ZERO;
2332        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
2333    }
2334}
2335
2336/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2337/// packed
2338/// double-precision (64-bit) floating-point elements.
2339///
2340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
2341#[inline]
2342#[target_feature(enable = "sse2")]
2343#[cfg_attr(test, assert_instr(cvtps2pd))]
2344#[stable(feature = "simd_x86", since = "1.27.0")]
2345pub fn _mm_cvtps_pd(a: __m128) -> __m128d {
2346    unsafe {
2347        let a = a.as_f32x4();
2348        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
2349    }
2350}
2351
2352/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2353/// packed 32-bit integers.
2354///
2355/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
2356#[inline]
2357#[target_feature(enable = "sse2")]
2358#[cfg_attr(test, assert_instr(cvtpd2dq))]
2359#[stable(feature = "simd_x86", since = "1.27.0")]
2360pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
2361    unsafe { transmute(cvtpd2dq(a)) }
2362}
2363
2364/// Converts the lower double-precision (64-bit) floating-point element in a to
2365/// a 32-bit integer.
2366///
2367/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
2368#[inline]
2369#[target_feature(enable = "sse2")]
2370#[cfg_attr(test, assert_instr(cvtsd2si))]
2371#[stable(feature = "simd_x86", since = "1.27.0")]
2372pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
2373    unsafe { cvtsd2si(a) }
2374}
2375
2376/// Converts the lower double-precision (64-bit) floating-point element in `b`
2377/// to a single-precision (32-bit) floating-point element, store the result in
2378/// the lower element of the return value, and copies the upper element from `a`
2379/// to the upper element the return value.
2380///
2381/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
2382#[inline]
2383#[target_feature(enable = "sse2")]
2384#[cfg_attr(test, assert_instr(cvtsd2ss))]
2385#[stable(feature = "simd_x86", since = "1.27.0")]
2386pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
2387    unsafe { cvtsd2ss(a, b) }
2388}
2389
2390/// Returns the lower double-precision (64-bit) floating-point element of `a`.
2391///
2392/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
2393#[inline]
2394#[target_feature(enable = "sse2")]
2395#[stable(feature = "simd_x86", since = "1.27.0")]
2396pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
2397    unsafe { simd_extract!(a, 0) }
2398}
2399
2400/// Converts the lower single-precision (32-bit) floating-point element in `b`
2401/// to a double-precision (64-bit) floating-point element, store the result in
2402/// the lower element of the return value, and copies the upper element from `a`
2403/// to the upper element the return value.
2404///
2405/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
2406#[inline]
2407#[target_feature(enable = "sse2")]
2408#[cfg_attr(test, assert_instr(cvtss2sd))]
2409#[stable(feature = "simd_x86", since = "1.27.0")]
2410pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
2411    unsafe { cvtss2sd(a, b) }
2412}
2413
2414/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2415/// packed 32-bit integers with truncation.
2416///
2417/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
2418#[inline]
2419#[target_feature(enable = "sse2")]
2420#[cfg_attr(test, assert_instr(cvttpd2dq))]
2421#[stable(feature = "simd_x86", since = "1.27.0")]
2422pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
2423    unsafe { transmute(cvttpd2dq(a)) }
2424}
2425
2426/// Converts the lower double-precision (64-bit) floating-point element in `a`
2427/// to a 32-bit integer with truncation.
2428///
2429/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
2430#[inline]
2431#[target_feature(enable = "sse2")]
2432#[cfg_attr(test, assert_instr(cvttsd2si))]
2433#[stable(feature = "simd_x86", since = "1.27.0")]
2434pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
2435    unsafe { cvttsd2si(a) }
2436}
2437
2438/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2439/// packed 32-bit integers with truncation.
2440///
2441/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
2442#[inline]
2443#[target_feature(enable = "sse2")]
2444#[cfg_attr(test, assert_instr(cvttps2dq))]
2445#[stable(feature = "simd_x86", since = "1.27.0")]
2446pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
2447    unsafe { transmute(cvttps2dq(a)) }
2448}
2449
2450/// Copies double-precision (64-bit) floating-point element `a` to the lower
2451/// element of the packed 64-bit return value.
2452///
2453/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
2454#[inline]
2455#[target_feature(enable = "sse2")]
2456#[stable(feature = "simd_x86", since = "1.27.0")]
2457pub fn _mm_set_sd(a: f64) -> __m128d {
2458    _mm_set_pd(0.0, a)
2459}
2460
2461/// Broadcasts double-precision (64-bit) floating-point value a to all elements
2462/// of the return value.
2463///
2464/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
2465#[inline]
2466#[target_feature(enable = "sse2")]
2467#[stable(feature = "simd_x86", since = "1.27.0")]
2468pub fn _mm_set1_pd(a: f64) -> __m128d {
2469    _mm_set_pd(a, a)
2470}
2471
2472/// Broadcasts double-precision (64-bit) floating-point value a to all elements
2473/// of the return value.
2474///
2475/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
2476#[inline]
2477#[target_feature(enable = "sse2")]
2478#[stable(feature = "simd_x86", since = "1.27.0")]
2479pub fn _mm_set_pd1(a: f64) -> __m128d {
2480    _mm_set_pd(a, a)
2481}
2482
2483/// Sets packed double-precision (64-bit) floating-point elements in the return
2484/// value with the supplied values.
2485///
2486/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
2487#[inline]
2488#[target_feature(enable = "sse2")]
2489#[stable(feature = "simd_x86", since = "1.27.0")]
2490pub fn _mm_set_pd(a: f64, b: f64) -> __m128d {
2491    __m128d([b, a])
2492}
2493
2494/// Sets packed double-precision (64-bit) floating-point elements in the return
2495/// value with the supplied values in reverse order.
2496///
2497/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
2498#[inline]
2499#[target_feature(enable = "sse2")]
2500#[stable(feature = "simd_x86", since = "1.27.0")]
2501pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
2502    _mm_set_pd(b, a)
2503}
2504
2505/// Returns packed double-precision (64-bit) floating-point elements with all
2506/// zeros.
2507///
2508/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
2509#[inline]
2510#[target_feature(enable = "sse2")]
2511#[cfg_attr(test, assert_instr(xorp))]
2512#[stable(feature = "simd_x86", since = "1.27.0")]
2513pub fn _mm_setzero_pd() -> __m128d {
2514    const { unsafe { mem::zeroed() } }
2515}
2516
2517/// Returns a mask of the most significant bit of each element in `a`.
2518///
2519/// The mask is stored in the 2 least significant bits of the return value.
2520/// All other bits are set to `0`.
2521///
2522/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
2523#[inline]
2524#[target_feature(enable = "sse2")]
2525#[cfg_attr(test, assert_instr(movmskpd))]
2526#[stable(feature = "simd_x86", since = "1.27.0")]
2527pub fn _mm_movemask_pd(a: __m128d) -> i32 {
2528    // Propagate the highest bit to the rest, because simd_bitmask
2529    // requires all-1 or all-0.
2530    unsafe {
2531        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
2532        simd_bitmask::<i64x2, u8>(mask).into()
2533    }
2534}
2535
2536/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
2537/// floating-point elements) from memory into the returned vector.
2538/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2539/// exception may be generated.
2540///
2541/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
2542#[inline]
2543#[target_feature(enable = "sse2")]
2544#[cfg_attr(
2545    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2546    assert_instr(movaps)
2547)]
2548#[stable(feature = "simd_x86", since = "1.27.0")]
2549#[allow(clippy::cast_ptr_alignment)]
2550pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
2551    *(mem_addr as *const __m128d)
2552}
2553
2554/// Loads a 64-bit double-precision value to the low element of a
2555/// 128-bit integer vector and clears the upper element.
2556///
2557/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
2558#[inline]
2559#[target_feature(enable = "sse2")]
2560#[cfg_attr(test, assert_instr(movsd))]
2561#[stable(feature = "simd_x86", since = "1.27.0")]
2562pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
2563    _mm_setr_pd(*mem_addr, 0.)
2564}
2565
2566/// Loads a double-precision value into the high-order bits of a 128-bit
2567/// vector of `[2 x double]`. The low-order bits are copied from the low-order
2568/// bits of the first operand.
2569///
2570/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
2571#[inline]
2572#[target_feature(enable = "sse2")]
2573#[cfg_attr(test, assert_instr(movhps))]
2574#[stable(feature = "simd_x86", since = "1.27.0")]
2575pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2576    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
2577}
2578
2579/// Loads a double-precision value into the low-order bits of a 128-bit
2580/// vector of `[2 x double]`. The high-order bits are copied from the
2581/// high-order bits of the first operand.
2582///
2583/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
2584#[inline]
2585#[target_feature(enable = "sse2")]
2586#[cfg_attr(test, assert_instr(movlps))]
2587#[stable(feature = "simd_x86", since = "1.27.0")]
2588pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2589    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
2590}
2591
2592/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
2593/// aligned memory location.
2594/// To minimize caching, the data is flagged as non-temporal (unlikely to be
2595/// used again soon).
2596///
2597/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
2598///
2599/// # Safety of non-temporal stores
2600///
2601/// After using this intrinsic, but before any other access to the memory that this intrinsic
2602/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2603/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2604/// return.
2605///
2606/// See [`_mm_sfence`] for details.
2607#[inline]
2608#[target_feature(enable = "sse2")]
2609#[cfg_attr(test, assert_instr(movntpd))]
2610#[stable(feature = "simd_x86", since = "1.27.0")]
2611#[allow(clippy::cast_ptr_alignment)]
2612pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
2613    crate::arch::asm!(
2614        vps!("movntpd", ",{a}"),
2615        p = in(reg) mem_addr,
2616        a = in(xmm_reg) a,
2617        options(nostack, preserves_flags),
2618    );
2619}
2620
2621/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2622/// memory location.
2623///
2624/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
2625#[inline]
2626#[target_feature(enable = "sse2")]
2627#[cfg_attr(test, assert_instr(movlps))]
2628#[stable(feature = "simd_x86", since = "1.27.0")]
2629pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
2630    *mem_addr = simd_extract!(a, 0)
2631}
2632
2633/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2634/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
2635/// on a 16-byte boundary or a general-protection exception may be generated.
2636///
2637/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
2638#[inline]
2639#[target_feature(enable = "sse2")]
2640#[cfg_attr(
2641    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2642    assert_instr(movaps)
2643)]
2644#[stable(feature = "simd_x86", since = "1.27.0")]
2645#[allow(clippy::cast_ptr_alignment)]
2646pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
2647    *(mem_addr as *mut __m128d) = a;
2648}
2649
2650/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2651/// floating-point elements) from `a` into memory.
2652/// `mem_addr` does not need to be aligned on any particular boundary.
2653///
2654/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
2655#[inline]
2656#[target_feature(enable = "sse2")]
2657#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
2658#[stable(feature = "simd_x86", since = "1.27.0")]
2659pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
2660    mem_addr.cast::<__m128d>().write_unaligned(a);
2661}
2662
2663/// Store 16-bit integer from the first element of a into memory.
2664///
2665/// `mem_addr` does not need to be aligned on any particular boundary.
2666///
2667/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2668#[inline]
2669#[target_feature(enable = "sse2")]
2670#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2671pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2672    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2673}
2674
2675/// Store 32-bit integer from the first element of a into memory.
2676///
2677/// `mem_addr` does not need to be aligned on any particular boundary.
2678///
2679/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
2680#[inline]
2681#[target_feature(enable = "sse2")]
2682#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2683pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2684    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2685}
2686
2687/// Store 64-bit integer from the first element of a into memory.
2688///
2689/// `mem_addr` does not need to be aligned on any particular boundary.
2690///
2691/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2692#[inline]
2693#[target_feature(enable = "sse2")]
2694#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2695pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2696    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2697}
2698
2699/// Stores the lower double-precision (64-bit) floating-point element from `a`
2700/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2701/// 16-byte boundary or a general-protection exception may be generated.
2702///
2703/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
2704#[inline]
2705#[target_feature(enable = "sse2")]
2706#[stable(feature = "simd_x86", since = "1.27.0")]
2707#[allow(clippy::cast_ptr_alignment)]
2708pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2709    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2710    *(mem_addr as *mut __m128d) = b;
2711}
2712
2713/// Stores the lower double-precision (64-bit) floating-point element from `a`
2714/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2715/// 16-byte boundary or a general-protection exception may be generated.
2716///
2717/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2718#[inline]
2719#[target_feature(enable = "sse2")]
2720#[stable(feature = "simd_x86", since = "1.27.0")]
2721#[allow(clippy::cast_ptr_alignment)]
2722pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2723    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2724    *(mem_addr as *mut __m128d) = b;
2725}
2726
2727/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2728/// memory in reverse order.
2729/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2730/// exception may be generated.
2731///
2732/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
2733#[inline]
2734#[target_feature(enable = "sse2")]
2735#[stable(feature = "simd_x86", since = "1.27.0")]
2736#[allow(clippy::cast_ptr_alignment)]
2737pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2738    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2739    *(mem_addr as *mut __m128d) = b;
2740}
2741
2742/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
2743/// memory location.
2744///
2745/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
2746#[inline]
2747#[target_feature(enable = "sse2")]
2748#[cfg_attr(test, assert_instr(movhps))]
2749#[stable(feature = "simd_x86", since = "1.27.0")]
2750pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
2751    *mem_addr = simd_extract!(a, 1);
2752}
2753
2754/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2755/// memory location.
2756///
2757/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
2758#[inline]
2759#[target_feature(enable = "sse2")]
2760#[cfg_attr(test, assert_instr(movlps))]
2761#[stable(feature = "simd_x86", since = "1.27.0")]
2762pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
2763    *mem_addr = simd_extract!(a, 0);
2764}
2765
2766/// Loads a double-precision (64-bit) floating-point element from memory
2767/// into both elements of returned vector.
2768///
2769/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
2770#[inline]
2771#[target_feature(enable = "sse2")]
2772// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2773#[stable(feature = "simd_x86", since = "1.27.0")]
2774pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2775    let d = *mem_addr;
2776    _mm_setr_pd(d, d)
2777}
2778
2779/// Loads a double-precision (64-bit) floating-point element from memory
2780/// into both elements of returned vector.
2781///
2782/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2783#[inline]
2784#[target_feature(enable = "sse2")]
2785// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2786#[stable(feature = "simd_x86", since = "1.27.0")]
2787pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2788    _mm_load1_pd(mem_addr)
2789}
2790
2791/// Loads 2 double-precision (64-bit) floating-point elements from memory into
2792/// the returned vector in reverse order. `mem_addr` must be aligned on a
2793/// 16-byte boundary or a general-protection exception may be generated.
2794///
2795/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
2796#[inline]
2797#[target_feature(enable = "sse2")]
2798#[cfg_attr(
2799    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
2800    assert_instr(movaps)
2801)]
2802#[stable(feature = "simd_x86", since = "1.27.0")]
2803pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
2804    let a = _mm_load_pd(mem_addr);
2805    simd_shuffle!(a, a, [1, 0])
2806}
2807
2808/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
2809/// floating-point elements) from memory into the returned vector.
2810/// `mem_addr` does not need to be aligned on any particular boundary.
2811///
2812/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
2813#[inline]
2814#[target_feature(enable = "sse2")]
2815#[cfg_attr(test, assert_instr(movups))]
2816#[stable(feature = "simd_x86", since = "1.27.0")]
2817pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
2818    let mut dst = _mm_undefined_pd();
2819    ptr::copy_nonoverlapping(
2820        mem_addr as *const u8,
2821        ptr::addr_of_mut!(dst) as *mut u8,
2822        mem::size_of::<__m128d>(),
2823    );
2824    dst
2825}
2826
2827/// Loads unaligned 16-bits of integer data from memory into new vector.
2828///
2829/// `mem_addr` does not need to be aligned on any particular boundary.
2830///
2831/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
2832#[inline]
2833#[target_feature(enable = "sse2")]
2834#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2835pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
2836    transmute(i16x8::new(
2837        ptr::read_unaligned(mem_addr as *const i16),
2838        0,
2839        0,
2840        0,
2841        0,
2842        0,
2843        0,
2844        0,
2845    ))
2846}
2847
2848/// Loads unaligned 32-bits of integer data from memory into new vector.
2849///
2850/// `mem_addr` does not need to be aligned on any particular boundary.
2851///
2852/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
2853#[inline]
2854#[target_feature(enable = "sse2")]
2855#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2856pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
2857    transmute(i32x4::new(
2858        ptr::read_unaligned(mem_addr as *const i32),
2859        0,
2860        0,
2861        0,
2862    ))
2863}
2864
2865/// Loads unaligned 64-bits of integer data from memory into new vector.
2866///
2867/// `mem_addr` does not need to be aligned on any particular boundary.
2868///
2869/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
2870#[inline]
2871#[target_feature(enable = "sse2")]
2872#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
2873pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
2874    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
2875}
2876
2877/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
2878/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
2879/// parameter as a specifier.
2880///
2881/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
2882#[inline]
2883#[target_feature(enable = "sse2")]
2884#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
2885#[rustc_legacy_const_generics(2)]
2886#[stable(feature = "simd_x86", since = "1.27.0")]
2887pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
2888    static_assert_uimm_bits!(MASK, 8);
2889    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
2890}
2891
2892/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
2893/// 64 bits are set to the lower 64 bits of the second parameter. The upper
2894/// 64 bits are set to the upper 64 bits of the first parameter.
2895///
2896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
2897#[inline]
2898#[target_feature(enable = "sse2")]
2899#[cfg_attr(test, assert_instr(movsd))]
2900#[stable(feature = "simd_x86", since = "1.27.0")]
2901pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
2902    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
2903}
2904
2905/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2906/// floating-point vector of `[4 x float]`.
2907///
2908/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
2909#[inline]
2910#[target_feature(enable = "sse2")]
2911#[stable(feature = "simd_x86", since = "1.27.0")]
2912pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
2913    unsafe { transmute(a) }
2914}
2915
2916/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2917/// integer vector.
2918///
2919/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
2920#[inline]
2921#[target_feature(enable = "sse2")]
2922#[stable(feature = "simd_x86", since = "1.27.0")]
2923pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
2924    unsafe { transmute(a) }
2925}
2926
2927/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2928/// floating-point vector of `[2 x double]`.
2929///
2930/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
2931#[inline]
2932#[target_feature(enable = "sse2")]
2933#[stable(feature = "simd_x86", since = "1.27.0")]
2934pub fn _mm_castps_pd(a: __m128) -> __m128d {
2935    unsafe { transmute(a) }
2936}
2937
2938/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2939/// integer vector.
2940///
2941/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
2942#[inline]
2943#[target_feature(enable = "sse2")]
2944#[stable(feature = "simd_x86", since = "1.27.0")]
2945pub fn _mm_castps_si128(a: __m128) -> __m128i {
2946    unsafe { transmute(a) }
2947}
2948
2949/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2950/// of `[2 x double]`.
2951///
2952/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
2953#[inline]
2954#[target_feature(enable = "sse2")]
2955#[stable(feature = "simd_x86", since = "1.27.0")]
2956pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
2957    unsafe { transmute(a) }
2958}
2959
2960/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2961/// of `[4 x float]`.
2962///
2963/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
2964#[inline]
2965#[target_feature(enable = "sse2")]
2966#[stable(feature = "simd_x86", since = "1.27.0")]
2967pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
2968    unsafe { transmute(a) }
2969}
2970
2971/// Returns vector of type __m128d with indeterminate elements.with indetermination elements.
2972/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2973/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2974/// In practice, this is typically equivalent to [`mem::zeroed`].
2975///
2976/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
2977#[inline]
2978#[target_feature(enable = "sse2")]
2979#[stable(feature = "simd_x86", since = "1.27.0")]
2980pub fn _mm_undefined_pd() -> __m128d {
2981    const { unsafe { mem::zeroed() } }
2982}
2983
2984/// Returns vector of type __m128i with indeterminate elements.with indetermination elements.
2985/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2986/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2987/// In practice, this is typically equivalent to [`mem::zeroed`].
2988///
2989/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
2990#[inline]
2991#[target_feature(enable = "sse2")]
2992#[stable(feature = "simd_x86", since = "1.27.0")]
2993pub fn _mm_undefined_si128() -> __m128i {
2994    const { unsafe { mem::zeroed() } }
2995}
2996
2997/// The resulting `__m128d` element is composed by the low-order values of
2998/// the two `__m128d` interleaved input elements, i.e.:
2999///
3000/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
3001/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
3002///
3003/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
3004#[inline]
3005#[target_feature(enable = "sse2")]
3006#[cfg_attr(test, assert_instr(unpckhpd))]
3007#[stable(feature = "simd_x86", since = "1.27.0")]
3008pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
3009    unsafe { simd_shuffle!(a, b, [1, 3]) }
3010}
3011
3012/// The resulting `__m128d` element is composed by the high-order values of
3013/// the two `__m128d` interleaved input elements, i.e.:
3014///
3015/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
3016/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
3017///
3018/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
3019#[inline]
3020#[target_feature(enable = "sse2")]
3021#[cfg_attr(test, assert_instr(movlhps))]
3022#[stable(feature = "simd_x86", since = "1.27.0")]
3023pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
3024    unsafe { simd_shuffle!(a, b, [0, 2]) }
3025}
3026
3027#[allow(improper_ctypes)]
3028unsafe extern "C" {
3029    #[link_name = "llvm.x86.sse2.pause"]
3030    fn pause();
3031    #[link_name = "llvm.x86.sse2.clflush"]
3032    fn clflush(p: *const u8);
3033    #[link_name = "llvm.x86.sse2.lfence"]
3034    fn lfence();
3035    #[link_name = "llvm.x86.sse2.mfence"]
3036    fn mfence();
3037    #[link_name = "llvm.x86.sse2.pmadd.wd"]
3038    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
3039    #[link_name = "llvm.x86.sse2.psad.bw"]
3040    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
3041    #[link_name = "llvm.x86.sse2.psll.w"]
3042    fn psllw(a: i16x8, count: i16x8) -> i16x8;
3043    #[link_name = "llvm.x86.sse2.psll.d"]
3044    fn pslld(a: i32x4, count: i32x4) -> i32x4;
3045    #[link_name = "llvm.x86.sse2.psll.q"]
3046    fn psllq(a: i64x2, count: i64x2) -> i64x2;
3047    #[link_name = "llvm.x86.sse2.psra.w"]
3048    fn psraw(a: i16x8, count: i16x8) -> i16x8;
3049    #[link_name = "llvm.x86.sse2.psra.d"]
3050    fn psrad(a: i32x4, count: i32x4) -> i32x4;
3051    #[link_name = "llvm.x86.sse2.psrl.w"]
3052    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
3053    #[link_name = "llvm.x86.sse2.psrl.d"]
3054    fn psrld(a: i32x4, count: i32x4) -> i32x4;
3055    #[link_name = "llvm.x86.sse2.psrl.q"]
3056    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
3057    #[link_name = "llvm.x86.sse2.cvtps2dq"]
3058    fn cvtps2dq(a: __m128) -> i32x4;
3059    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
3060    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
3061    #[link_name = "llvm.x86.sse2.packsswb.128"]
3062    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
3063    #[link_name = "llvm.x86.sse2.packssdw.128"]
3064    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
3065    #[link_name = "llvm.x86.sse2.packuswb.128"]
3066    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
3067    #[link_name = "llvm.x86.sse2.max.sd"]
3068    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
3069    #[link_name = "llvm.x86.sse2.max.pd"]
3070    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
3071    #[link_name = "llvm.x86.sse2.min.sd"]
3072    fn minsd(a: __m128d, b: __m128d) -> __m128d;
3073    #[link_name = "llvm.x86.sse2.min.pd"]
3074    fn minpd(a: __m128d, b: __m128d) -> __m128d;
3075    #[link_name = "llvm.x86.sse2.cmp.sd"]
3076    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3077    #[link_name = "llvm.x86.sse2.cmp.pd"]
3078    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3079    #[link_name = "llvm.x86.sse2.comieq.sd"]
3080    fn comieqsd(a: __m128d, b: __m128d) -> i32;
3081    #[link_name = "llvm.x86.sse2.comilt.sd"]
3082    fn comiltsd(a: __m128d, b: __m128d) -> i32;
3083    #[link_name = "llvm.x86.sse2.comile.sd"]
3084    fn comilesd(a: __m128d, b: __m128d) -> i32;
3085    #[link_name = "llvm.x86.sse2.comigt.sd"]
3086    fn comigtsd(a: __m128d, b: __m128d) -> i32;
3087    #[link_name = "llvm.x86.sse2.comige.sd"]
3088    fn comigesd(a: __m128d, b: __m128d) -> i32;
3089    #[link_name = "llvm.x86.sse2.comineq.sd"]
3090    fn comineqsd(a: __m128d, b: __m128d) -> i32;
3091    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
3092    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
3093    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
3094    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
3095    #[link_name = "llvm.x86.sse2.ucomile.sd"]
3096    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
3097    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
3098    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
3099    #[link_name = "llvm.x86.sse2.ucomige.sd"]
3100    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
3101    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
3102    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
3103    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
3104    fn cvtpd2dq(a: __m128d) -> i32x4;
3105    #[link_name = "llvm.x86.sse2.cvtsd2si"]
3106    fn cvtsd2si(a: __m128d) -> i32;
3107    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
3108    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
3109    #[link_name = "llvm.x86.sse2.cvtss2sd"]
3110    fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
3111    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
3112    fn cvttpd2dq(a: __m128d) -> i32x4;
3113    #[link_name = "llvm.x86.sse2.cvttsd2si"]
3114    fn cvttsd2si(a: __m128d) -> i32;
3115    #[link_name = "llvm.x86.sse2.cvttps2dq"]
3116    fn cvttps2dq(a: __m128) -> i32x4;
3117}
3118
3119#[cfg(test)]
3120mod tests {
3121    use crate::{
3122        core_arch::{simd::*, x86::*},
3123        hint::black_box,
3124    };
3125    use std::{
3126        boxed, f32, f64,
3127        mem::{self, transmute},
3128        ptr,
3129    };
3130    use stdarch_test::simd_test;
3131
3132    const NAN: f64 = f64::NAN;
3133
3134    #[test]
3135    fn test_mm_pause() {
3136        unsafe { _mm_pause() }
3137    }
3138
3139    #[simd_test(enable = "sse2")]
3140    unsafe fn test_mm_clflush() {
3141        let x = 0_u8;
3142        _mm_clflush(ptr::addr_of!(x));
3143    }
3144
3145    #[simd_test(enable = "sse2")]
3146    // Miri cannot support this until it is clear how it fits in the Rust memory model
3147    #[cfg_attr(miri, ignore)]
3148    unsafe fn test_mm_lfence() {
3149        _mm_lfence();
3150    }
3151
3152    #[simd_test(enable = "sse2")]
3153    // Miri cannot support this until it is clear how it fits in the Rust memory model
3154    #[cfg_attr(miri, ignore)]
3155    unsafe fn test_mm_mfence() {
3156        _mm_mfence();
3157    }
3158
3159    #[simd_test(enable = "sse2")]
3160    unsafe fn test_mm_add_epi8() {
3161        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3162        #[rustfmt::skip]
3163        let b = _mm_setr_epi8(
3164            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3165        );
3166        let r = _mm_add_epi8(a, b);
3167        #[rustfmt::skip]
3168        let e = _mm_setr_epi8(
3169            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3170        );
3171        assert_eq_m128i(r, e);
3172    }
3173
3174    #[simd_test(enable = "sse2")]
3175    unsafe fn test_mm_add_epi8_overflow() {
3176        let a = _mm_set1_epi8(0x7F);
3177        let b = _mm_set1_epi8(1);
3178        let r = _mm_add_epi8(a, b);
3179        assert_eq_m128i(r, _mm_set1_epi8(-128));
3180    }
3181
3182    #[simd_test(enable = "sse2")]
3183    unsafe fn test_mm_add_epi16() {
3184        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3185        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3186        let r = _mm_add_epi16(a, b);
3187        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3188        assert_eq_m128i(r, e);
3189    }
3190
3191    #[simd_test(enable = "sse2")]
3192    unsafe fn test_mm_add_epi32() {
3193        let a = _mm_setr_epi32(0, 1, 2, 3);
3194        let b = _mm_setr_epi32(4, 5, 6, 7);
3195        let r = _mm_add_epi32(a, b);
3196        let e = _mm_setr_epi32(4, 6, 8, 10);
3197        assert_eq_m128i(r, e);
3198    }
3199
3200    #[simd_test(enable = "sse2")]
3201    unsafe fn test_mm_add_epi64() {
3202        let a = _mm_setr_epi64x(0, 1);
3203        let b = _mm_setr_epi64x(2, 3);
3204        let r = _mm_add_epi64(a, b);
3205        let e = _mm_setr_epi64x(2, 4);
3206        assert_eq_m128i(r, e);
3207    }
3208
3209    #[simd_test(enable = "sse2")]
3210    unsafe fn test_mm_adds_epi8() {
3211        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3212        #[rustfmt::skip]
3213        let b = _mm_setr_epi8(
3214            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3215        );
3216        let r = _mm_adds_epi8(a, b);
3217        #[rustfmt::skip]
3218        let e = _mm_setr_epi8(
3219            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3220        );
3221        assert_eq_m128i(r, e);
3222    }
3223
3224    #[simd_test(enable = "sse2")]
3225    unsafe fn test_mm_adds_epi8_saturate_positive() {
3226        let a = _mm_set1_epi8(0x7F);
3227        let b = _mm_set1_epi8(1);
3228        let r = _mm_adds_epi8(a, b);
3229        assert_eq_m128i(r, a);
3230    }
3231
3232    #[simd_test(enable = "sse2")]
3233    unsafe fn test_mm_adds_epi8_saturate_negative() {
3234        let a = _mm_set1_epi8(-0x80);
3235        let b = _mm_set1_epi8(-1);
3236        let r = _mm_adds_epi8(a, b);
3237        assert_eq_m128i(r, a);
3238    }
3239
3240    #[simd_test(enable = "sse2")]
3241    unsafe fn test_mm_adds_epi16() {
3242        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3243        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3244        let r = _mm_adds_epi16(a, b);
3245        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3246        assert_eq_m128i(r, e);
3247    }
3248
3249    #[simd_test(enable = "sse2")]
3250    unsafe fn test_mm_adds_epi16_saturate_positive() {
3251        let a = _mm_set1_epi16(0x7FFF);
3252        let b = _mm_set1_epi16(1);
3253        let r = _mm_adds_epi16(a, b);
3254        assert_eq_m128i(r, a);
3255    }
3256
3257    #[simd_test(enable = "sse2")]
3258    unsafe fn test_mm_adds_epi16_saturate_negative() {
3259        let a = _mm_set1_epi16(-0x8000);
3260        let b = _mm_set1_epi16(-1);
3261        let r = _mm_adds_epi16(a, b);
3262        assert_eq_m128i(r, a);
3263    }
3264
3265    #[simd_test(enable = "sse2")]
3266    unsafe fn test_mm_adds_epu8() {
3267        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3268        #[rustfmt::skip]
3269        let b = _mm_setr_epi8(
3270            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3271        );
3272        let r = _mm_adds_epu8(a, b);
3273        #[rustfmt::skip]
3274        let e = _mm_setr_epi8(
3275            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3276        );
3277        assert_eq_m128i(r, e);
3278    }
3279
3280    #[simd_test(enable = "sse2")]
3281    unsafe fn test_mm_adds_epu8_saturate() {
3282        let a = _mm_set1_epi8(!0);
3283        let b = _mm_set1_epi8(1);
3284        let r = _mm_adds_epu8(a, b);
3285        assert_eq_m128i(r, a);
3286    }
3287
3288    #[simd_test(enable = "sse2")]
3289    unsafe fn test_mm_adds_epu16() {
3290        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3291        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3292        let r = _mm_adds_epu16(a, b);
3293        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3294        assert_eq_m128i(r, e);
3295    }
3296
3297    #[simd_test(enable = "sse2")]
3298    unsafe fn test_mm_adds_epu16_saturate() {
3299        let a = _mm_set1_epi16(!0);
3300        let b = _mm_set1_epi16(1);
3301        let r = _mm_adds_epu16(a, b);
3302        assert_eq_m128i(r, a);
3303    }
3304
3305    #[simd_test(enable = "sse2")]
3306    unsafe fn test_mm_avg_epu8() {
3307        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3308        let r = _mm_avg_epu8(a, b);
3309        assert_eq_m128i(r, _mm_set1_epi8(6));
3310    }
3311
3312    #[simd_test(enable = "sse2")]
3313    unsafe fn test_mm_avg_epu16() {
3314        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3315        let r = _mm_avg_epu16(a, b);
3316        assert_eq_m128i(r, _mm_set1_epi16(6));
3317    }
3318
3319    #[simd_test(enable = "sse2")]
3320    unsafe fn test_mm_madd_epi16() {
3321        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3322        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3323        let r = _mm_madd_epi16(a, b);
3324        let e = _mm_setr_epi32(29, 81, 149, 233);
3325        assert_eq_m128i(r, e);
3326
3327        // Test large values.
3328        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3329        let a = _mm_setr_epi16(
3330            i16::MAX,
3331            i16::MAX,
3332            i16::MIN,
3333            i16::MIN,
3334            i16::MIN,
3335            i16::MAX,
3336            0,
3337            0,
3338        );
3339        let b = _mm_setr_epi16(
3340            i16::MAX,
3341            i16::MAX,
3342            i16::MIN,
3343            i16::MIN,
3344            i16::MAX,
3345            i16::MIN,
3346            0,
3347            0,
3348        );
3349        let r = _mm_madd_epi16(a, b);
3350        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3351        assert_eq_m128i(r, e);
3352    }
3353
3354    #[simd_test(enable = "sse2")]
3355    unsafe fn test_mm_max_epi16() {
3356        let a = _mm_set1_epi16(1);
3357        let b = _mm_set1_epi16(-1);
3358        let r = _mm_max_epi16(a, b);
3359        assert_eq_m128i(r, a);
3360    }
3361
3362    #[simd_test(enable = "sse2")]
3363    unsafe fn test_mm_max_epu8() {
3364        let a = _mm_set1_epi8(1);
3365        let b = _mm_set1_epi8(!0);
3366        let r = _mm_max_epu8(a, b);
3367        assert_eq_m128i(r, b);
3368    }
3369
3370    #[simd_test(enable = "sse2")]
3371    unsafe fn test_mm_min_epi16() {
3372        let a = _mm_set1_epi16(1);
3373        let b = _mm_set1_epi16(-1);
3374        let r = _mm_min_epi16(a, b);
3375        assert_eq_m128i(r, b);
3376    }
3377
3378    #[simd_test(enable = "sse2")]
3379    unsafe fn test_mm_min_epu8() {
3380        let a = _mm_set1_epi8(1);
3381        let b = _mm_set1_epi8(!0);
3382        let r = _mm_min_epu8(a, b);
3383        assert_eq_m128i(r, a);
3384    }
3385
3386    #[simd_test(enable = "sse2")]
3387    unsafe fn test_mm_mulhi_epi16() {
3388        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3389        let r = _mm_mulhi_epi16(a, b);
3390        assert_eq_m128i(r, _mm_set1_epi16(-16));
3391    }
3392
3393    #[simd_test(enable = "sse2")]
3394    unsafe fn test_mm_mulhi_epu16() {
3395        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3396        let r = _mm_mulhi_epu16(a, b);
3397        assert_eq_m128i(r, _mm_set1_epi16(15));
3398    }
3399
3400    #[simd_test(enable = "sse2")]
3401    unsafe fn test_mm_mullo_epi16() {
3402        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3403        let r = _mm_mullo_epi16(a, b);
3404        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3405    }
3406
3407    #[simd_test(enable = "sse2")]
3408    unsafe fn test_mm_mul_epu32() {
3409        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3410        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3411        let r = _mm_mul_epu32(a, b);
3412        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3413        assert_eq_m128i(r, e);
3414    }
3415
3416    #[simd_test(enable = "sse2")]
3417    unsafe fn test_mm_sad_epu8() {
3418        #[rustfmt::skip]
3419        let a = _mm_setr_epi8(
3420            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3421            1, 2, 3, 4,
3422            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3423            1, 2, 3, 4,
3424        );
3425        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3426        let r = _mm_sad_epu8(a, b);
3427        let e = _mm_setr_epi64x(1020, 614);
3428        assert_eq_m128i(r, e);
3429    }
3430
3431    #[simd_test(enable = "sse2")]
3432    unsafe fn test_mm_sub_epi8() {
3433        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3434        let r = _mm_sub_epi8(a, b);
3435        assert_eq_m128i(r, _mm_set1_epi8(-1));
3436    }
3437
3438    #[simd_test(enable = "sse2")]
3439    unsafe fn test_mm_sub_epi16() {
3440        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3441        let r = _mm_sub_epi16(a, b);
3442        assert_eq_m128i(r, _mm_set1_epi16(-1));
3443    }
3444
3445    #[simd_test(enable = "sse2")]
3446    unsafe fn test_mm_sub_epi32() {
3447        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3448        let r = _mm_sub_epi32(a, b);
3449        assert_eq_m128i(r, _mm_set1_epi32(-1));
3450    }
3451
3452    #[simd_test(enable = "sse2")]
3453    unsafe fn test_mm_sub_epi64() {
3454        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3455        let r = _mm_sub_epi64(a, b);
3456        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3457    }
3458
3459    #[simd_test(enable = "sse2")]
3460    unsafe fn test_mm_subs_epi8() {
3461        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3462        let r = _mm_subs_epi8(a, b);
3463        assert_eq_m128i(r, _mm_set1_epi8(3));
3464    }
3465
3466    #[simd_test(enable = "sse2")]
3467    unsafe fn test_mm_subs_epi8_saturate_positive() {
3468        let a = _mm_set1_epi8(0x7F);
3469        let b = _mm_set1_epi8(-1);
3470        let r = _mm_subs_epi8(a, b);
3471        assert_eq_m128i(r, a);
3472    }
3473
3474    #[simd_test(enable = "sse2")]
3475    unsafe fn test_mm_subs_epi8_saturate_negative() {
3476        let a = _mm_set1_epi8(-0x80);
3477        let b = _mm_set1_epi8(1);
3478        let r = _mm_subs_epi8(a, b);
3479        assert_eq_m128i(r, a);
3480    }
3481
3482    #[simd_test(enable = "sse2")]
3483    unsafe fn test_mm_subs_epi16() {
3484        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3485        let r = _mm_subs_epi16(a, b);
3486        assert_eq_m128i(r, _mm_set1_epi16(3));
3487    }
3488
3489    #[simd_test(enable = "sse2")]
3490    unsafe fn test_mm_subs_epi16_saturate_positive() {
3491        let a = _mm_set1_epi16(0x7FFF);
3492        let b = _mm_set1_epi16(-1);
3493        let r = _mm_subs_epi16(a, b);
3494        assert_eq_m128i(r, a);
3495    }
3496
3497    #[simd_test(enable = "sse2")]
3498    unsafe fn test_mm_subs_epi16_saturate_negative() {
3499        let a = _mm_set1_epi16(-0x8000);
3500        let b = _mm_set1_epi16(1);
3501        let r = _mm_subs_epi16(a, b);
3502        assert_eq_m128i(r, a);
3503    }
3504
3505    #[simd_test(enable = "sse2")]
3506    unsafe fn test_mm_subs_epu8() {
3507        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3508        let r = _mm_subs_epu8(a, b);
3509        assert_eq_m128i(r, _mm_set1_epi8(3));
3510    }
3511
3512    #[simd_test(enable = "sse2")]
3513    unsafe fn test_mm_subs_epu8_saturate() {
3514        let a = _mm_set1_epi8(0);
3515        let b = _mm_set1_epi8(1);
3516        let r = _mm_subs_epu8(a, b);
3517        assert_eq_m128i(r, a);
3518    }
3519
3520    #[simd_test(enable = "sse2")]
3521    unsafe fn test_mm_subs_epu16() {
3522        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3523        let r = _mm_subs_epu16(a, b);
3524        assert_eq_m128i(r, _mm_set1_epi16(3));
3525    }
3526
3527    #[simd_test(enable = "sse2")]
3528    unsafe fn test_mm_subs_epu16_saturate() {
3529        let a = _mm_set1_epi16(0);
3530        let b = _mm_set1_epi16(1);
3531        let r = _mm_subs_epu16(a, b);
3532        assert_eq_m128i(r, a);
3533    }
3534
3535    #[simd_test(enable = "sse2")]
3536    unsafe fn test_mm_slli_si128() {
3537        #[rustfmt::skip]
3538        let a = _mm_setr_epi8(
3539            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3540        );
3541        let r = _mm_slli_si128::<1>(a);
3542        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3543        assert_eq_m128i(r, e);
3544
3545        #[rustfmt::skip]
3546        let a = _mm_setr_epi8(
3547            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3548        );
3549        let r = _mm_slli_si128::<15>(a);
3550        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3551        assert_eq_m128i(r, e);
3552
3553        #[rustfmt::skip]
3554        let a = _mm_setr_epi8(
3555            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3556        );
3557        let r = _mm_slli_si128::<16>(a);
3558        assert_eq_m128i(r, _mm_set1_epi8(0));
3559    }
3560
3561    #[simd_test(enable = "sse2")]
3562    unsafe fn test_mm_slli_epi16() {
3563        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3564        let r = _mm_slli_epi16::<4>(a);
3565        assert_eq_m128i(
3566            r,
3567            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3568        );
3569        let r = _mm_slli_epi16::<16>(a);
3570        assert_eq_m128i(r, _mm_set1_epi16(0));
3571    }
3572
3573    #[simd_test(enable = "sse2")]
3574    unsafe fn test_mm_sll_epi16() {
3575        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3576        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3577        assert_eq_m128i(
3578            r,
3579            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3580        );
3581        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3582        assert_eq_m128i(r, a);
3583        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3584        assert_eq_m128i(r, _mm_set1_epi16(0));
3585        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3586        assert_eq_m128i(r, _mm_set1_epi16(0));
3587    }
3588
3589    #[simd_test(enable = "sse2")]
3590    unsafe fn test_mm_slli_epi32() {
3591        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3592        let r = _mm_slli_epi32::<4>(a);
3593        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3594        let r = _mm_slli_epi32::<32>(a);
3595        assert_eq_m128i(r, _mm_set1_epi32(0));
3596    }
3597
3598    #[simd_test(enable = "sse2")]
3599    unsafe fn test_mm_sll_epi32() {
3600        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3601        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3602        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3603        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3604        assert_eq_m128i(r, a);
3605        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3606        assert_eq_m128i(r, _mm_set1_epi32(0));
3607        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3608        assert_eq_m128i(r, _mm_set1_epi32(0));
3609    }
3610
3611    #[simd_test(enable = "sse2")]
3612    unsafe fn test_mm_slli_epi64() {
3613        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3614        let r = _mm_slli_epi64::<4>(a);
3615        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3616        let r = _mm_slli_epi64::<64>(a);
3617        assert_eq_m128i(r, _mm_set1_epi64x(0));
3618    }
3619
3620    #[simd_test(enable = "sse2")]
3621    unsafe fn test_mm_sll_epi64() {
3622        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3623        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3624        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3625        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3626        assert_eq_m128i(r, a);
3627        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3628        assert_eq_m128i(r, _mm_set1_epi64x(0));
3629        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3630        assert_eq_m128i(r, _mm_set1_epi64x(0));
3631    }
3632
3633    #[simd_test(enable = "sse2")]
3634    unsafe fn test_mm_srai_epi16() {
3635        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3636        let r = _mm_srai_epi16::<4>(a);
3637        assert_eq_m128i(
3638            r,
3639            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3640        );
3641        let r = _mm_srai_epi16::<16>(a);
3642        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3643    }
3644
3645    #[simd_test(enable = "sse2")]
3646    unsafe fn test_mm_sra_epi16() {
3647        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3648        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3649        assert_eq_m128i(
3650            r,
3651            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3652        );
3653        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3654        assert_eq_m128i(r, a);
3655        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3656        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3657        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3658        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3659    }
3660
3661    #[simd_test(enable = "sse2")]
3662    unsafe fn test_mm_srai_epi32() {
3663        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3664        let r = _mm_srai_epi32::<4>(a);
3665        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3666        let r = _mm_srai_epi32::<32>(a);
3667        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3668    }
3669
3670    #[simd_test(enable = "sse2")]
3671    unsafe fn test_mm_sra_epi32() {
3672        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3673        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3674        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3675        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3676        assert_eq_m128i(r, a);
3677        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3678        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3679        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3680        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3681    }
3682
3683    #[simd_test(enable = "sse2")]
3684    unsafe fn test_mm_srli_si128() {
3685        #[rustfmt::skip]
3686        let a = _mm_setr_epi8(
3687            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3688        );
3689        let r = _mm_srli_si128::<1>(a);
3690        #[rustfmt::skip]
3691        let e = _mm_setr_epi8(
3692            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3693        );
3694        assert_eq_m128i(r, e);
3695
3696        #[rustfmt::skip]
3697        let a = _mm_setr_epi8(
3698            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3699        );
3700        let r = _mm_srli_si128::<15>(a);
3701        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3702        assert_eq_m128i(r, e);
3703
3704        #[rustfmt::skip]
3705        let a = _mm_setr_epi8(
3706            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3707        );
3708        let r = _mm_srli_si128::<16>(a);
3709        assert_eq_m128i(r, _mm_set1_epi8(0));
3710    }
3711
3712    #[simd_test(enable = "sse2")]
3713    unsafe fn test_mm_srli_epi16() {
3714        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3715        let r = _mm_srli_epi16::<4>(a);
3716        assert_eq_m128i(
3717            r,
3718            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3719        );
3720        let r = _mm_srli_epi16::<16>(a);
3721        assert_eq_m128i(r, _mm_set1_epi16(0));
3722    }
3723
3724    #[simd_test(enable = "sse2")]
3725    unsafe fn test_mm_srl_epi16() {
3726        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3727        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3728        assert_eq_m128i(
3729            r,
3730            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3731        );
3732        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3733        assert_eq_m128i(r, a);
3734        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3735        assert_eq_m128i(r, _mm_set1_epi16(0));
3736        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3737        assert_eq_m128i(r, _mm_set1_epi16(0));
3738    }
3739
3740    #[simd_test(enable = "sse2")]
3741    unsafe fn test_mm_srli_epi32() {
3742        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3743        let r = _mm_srli_epi32::<4>(a);
3744        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3745        let r = _mm_srli_epi32::<32>(a);
3746        assert_eq_m128i(r, _mm_set1_epi32(0));
3747    }
3748
3749    #[simd_test(enable = "sse2")]
3750    unsafe fn test_mm_srl_epi32() {
3751        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3752        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
3753        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3754        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
3755        assert_eq_m128i(r, a);
3756        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
3757        assert_eq_m128i(r, _mm_set1_epi32(0));
3758        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
3759        assert_eq_m128i(r, _mm_set1_epi32(0));
3760    }
3761
3762    #[simd_test(enable = "sse2")]
3763    unsafe fn test_mm_srli_epi64() {
3764        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3765        let r = _mm_srli_epi64::<4>(a);
3766        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3767        let r = _mm_srli_epi64::<64>(a);
3768        assert_eq_m128i(r, _mm_set1_epi64x(0));
3769    }
3770
3771    #[simd_test(enable = "sse2")]
3772    unsafe fn test_mm_srl_epi64() {
3773        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3774        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
3775        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3776        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
3777        assert_eq_m128i(r, a);
3778        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
3779        assert_eq_m128i(r, _mm_set1_epi64x(0));
3780        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
3781        assert_eq_m128i(r, _mm_set1_epi64x(0));
3782    }
3783
3784    #[simd_test(enable = "sse2")]
3785    unsafe fn test_mm_and_si128() {
3786        let a = _mm_set1_epi8(5);
3787        let b = _mm_set1_epi8(3);
3788        let r = _mm_and_si128(a, b);
3789        assert_eq_m128i(r, _mm_set1_epi8(1));
3790    }
3791
3792    #[simd_test(enable = "sse2")]
3793    unsafe fn test_mm_andnot_si128() {
3794        let a = _mm_set1_epi8(5);
3795        let b = _mm_set1_epi8(3);
3796        let r = _mm_andnot_si128(a, b);
3797        assert_eq_m128i(r, _mm_set1_epi8(2));
3798    }
3799
3800    #[simd_test(enable = "sse2")]
3801    unsafe fn test_mm_or_si128() {
3802        let a = _mm_set1_epi8(5);
3803        let b = _mm_set1_epi8(3);
3804        let r = _mm_or_si128(a, b);
3805        assert_eq_m128i(r, _mm_set1_epi8(7));
3806    }
3807
3808    #[simd_test(enable = "sse2")]
3809    unsafe fn test_mm_xor_si128() {
3810        let a = _mm_set1_epi8(5);
3811        let b = _mm_set1_epi8(3);
3812        let r = _mm_xor_si128(a, b);
3813        assert_eq_m128i(r, _mm_set1_epi8(6));
3814    }
3815
3816    #[simd_test(enable = "sse2")]
3817    unsafe fn test_mm_cmpeq_epi8() {
3818        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3819        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
3820        let r = _mm_cmpeq_epi8(a, b);
3821        #[rustfmt::skip]
3822        assert_eq_m128i(
3823            r,
3824            _mm_setr_epi8(
3825                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
3826            )
3827        );
3828    }
3829
3830    #[simd_test(enable = "sse2")]
3831    unsafe fn test_mm_cmpeq_epi16() {
3832        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3833        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
3834        let r = _mm_cmpeq_epi16(a, b);
3835        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
3836    }
3837
3838    #[simd_test(enable = "sse2")]
3839    unsafe fn test_mm_cmpeq_epi32() {
3840        let a = _mm_setr_epi32(0, 1, 2, 3);
3841        let b = _mm_setr_epi32(3, 2, 2, 0);
3842        let r = _mm_cmpeq_epi32(a, b);
3843        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
3844    }
3845
3846    #[simd_test(enable = "sse2")]
3847    unsafe fn test_mm_cmpgt_epi8() {
3848        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3849        let b = _mm_set1_epi8(0);
3850        let r = _mm_cmpgt_epi8(a, b);
3851        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3852        assert_eq_m128i(r, e);
3853    }
3854
3855    #[simd_test(enable = "sse2")]
3856    unsafe fn test_mm_cmpgt_epi16() {
3857        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3858        let b = _mm_set1_epi16(0);
3859        let r = _mm_cmpgt_epi16(a, b);
3860        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3861        assert_eq_m128i(r, e);
3862    }
3863
3864    #[simd_test(enable = "sse2")]
3865    unsafe fn test_mm_cmpgt_epi32() {
3866        let a = _mm_set_epi32(5, 0, 0, 0);
3867        let b = _mm_set1_epi32(0);
3868        let r = _mm_cmpgt_epi32(a, b);
3869        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3870    }
3871
3872    #[simd_test(enable = "sse2")]
3873    unsafe fn test_mm_cmplt_epi8() {
3874        let a = _mm_set1_epi8(0);
3875        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3876        let r = _mm_cmplt_epi8(a, b);
3877        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3878        assert_eq_m128i(r, e);
3879    }
3880
3881    #[simd_test(enable = "sse2")]
3882    unsafe fn test_mm_cmplt_epi16() {
3883        let a = _mm_set1_epi16(0);
3884        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3885        let r = _mm_cmplt_epi16(a, b);
3886        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3887        assert_eq_m128i(r, e);
3888    }
3889
3890    #[simd_test(enable = "sse2")]
3891    unsafe fn test_mm_cmplt_epi32() {
3892        let a = _mm_set1_epi32(0);
3893        let b = _mm_set_epi32(5, 0, 0, 0);
3894        let r = _mm_cmplt_epi32(a, b);
3895        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3896    }
3897
3898    #[simd_test(enable = "sse2")]
3899    unsafe fn test_mm_cvtepi32_pd() {
3900        let a = _mm_set_epi32(35, 25, 15, 5);
3901        let r = _mm_cvtepi32_pd(a);
3902        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
3903    }
3904
3905    #[simd_test(enable = "sse2")]
3906    unsafe fn test_mm_cvtsi32_sd() {
3907        let a = _mm_set1_pd(3.5);
3908        let r = _mm_cvtsi32_sd(a, 5);
3909        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
3910    }
3911
3912    #[simd_test(enable = "sse2")]
3913    unsafe fn test_mm_cvtepi32_ps() {
3914        let a = _mm_setr_epi32(1, 2, 3, 4);
3915        let r = _mm_cvtepi32_ps(a);
3916        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3917    }
3918
3919    #[simd_test(enable = "sse2")]
3920    unsafe fn test_mm_cvtps_epi32() {
3921        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3922        let r = _mm_cvtps_epi32(a);
3923        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
3924    }
3925
3926    #[simd_test(enable = "sse2")]
3927    unsafe fn test_mm_cvtsi32_si128() {
3928        let r = _mm_cvtsi32_si128(5);
3929        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
3930    }
3931
3932    #[simd_test(enable = "sse2")]
3933    unsafe fn test_mm_cvtsi128_si32() {
3934        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
3935        assert_eq!(r, 5);
3936    }
3937
3938    #[simd_test(enable = "sse2")]
3939    unsafe fn test_mm_set_epi64x() {
3940        let r = _mm_set_epi64x(0, 1);
3941        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
3942    }
3943
3944    #[simd_test(enable = "sse2")]
3945    unsafe fn test_mm_set_epi32() {
3946        let r = _mm_set_epi32(0, 1, 2, 3);
3947        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
3948    }
3949
3950    #[simd_test(enable = "sse2")]
3951    unsafe fn test_mm_set_epi16() {
3952        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3953        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
3954    }
3955
3956    #[simd_test(enable = "sse2")]
3957    unsafe fn test_mm_set_epi8() {
3958        #[rustfmt::skip]
3959        let r = _mm_set_epi8(
3960            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3961        );
3962        #[rustfmt::skip]
3963        let e = _mm_setr_epi8(
3964            15, 14, 13, 12, 11, 10, 9, 8,
3965            7, 6, 5, 4, 3, 2, 1, 0,
3966        );
3967        assert_eq_m128i(r, e);
3968    }
3969
3970    #[simd_test(enable = "sse2")]
3971    unsafe fn test_mm_set1_epi64x() {
3972        let r = _mm_set1_epi64x(1);
3973        assert_eq_m128i(r, _mm_set1_epi64x(1));
3974    }
3975
3976    #[simd_test(enable = "sse2")]
3977    unsafe fn test_mm_set1_epi32() {
3978        let r = _mm_set1_epi32(1);
3979        assert_eq_m128i(r, _mm_set1_epi32(1));
3980    }
3981
3982    #[simd_test(enable = "sse2")]
3983    unsafe fn test_mm_set1_epi16() {
3984        let r = _mm_set1_epi16(1);
3985        assert_eq_m128i(r, _mm_set1_epi16(1));
3986    }
3987
3988    #[simd_test(enable = "sse2")]
3989    unsafe fn test_mm_set1_epi8() {
3990        let r = _mm_set1_epi8(1);
3991        assert_eq_m128i(r, _mm_set1_epi8(1));
3992    }
3993
3994    #[simd_test(enable = "sse2")]
3995    unsafe fn test_mm_setr_epi32() {
3996        let r = _mm_setr_epi32(0, 1, 2, 3);
3997        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
3998    }
3999
4000    #[simd_test(enable = "sse2")]
4001    unsafe fn test_mm_setr_epi16() {
4002        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4003        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
4004    }
4005
4006    #[simd_test(enable = "sse2")]
4007    unsafe fn test_mm_setr_epi8() {
4008        #[rustfmt::skip]
4009        let r = _mm_setr_epi8(
4010            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4011        );
4012        #[rustfmt::skip]
4013        let e = _mm_setr_epi8(
4014            0, 1, 2, 3, 4, 5, 6, 7,
4015            8, 9, 10, 11, 12, 13, 14, 15,
4016        );
4017        assert_eq_m128i(r, e);
4018    }
4019
4020    #[simd_test(enable = "sse2")]
4021    unsafe fn test_mm_setzero_si128() {
4022        let r = _mm_setzero_si128();
4023        assert_eq_m128i(r, _mm_set1_epi64x(0));
4024    }
4025
4026    #[simd_test(enable = "sse2")]
4027    unsafe fn test_mm_loadl_epi64() {
4028        let a = _mm_setr_epi64x(6, 5);
4029        let r = _mm_loadl_epi64(ptr::addr_of!(a));
4030        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
4031    }
4032
4033    #[simd_test(enable = "sse2")]
4034    unsafe fn test_mm_load_si128() {
4035        let a = _mm_set_epi64x(5, 6);
4036        let r = _mm_load_si128(ptr::addr_of!(a) as *const _);
4037        assert_eq_m128i(a, r);
4038    }
4039
4040    #[simd_test(enable = "sse2")]
4041    unsafe fn test_mm_loadu_si128() {
4042        let a = _mm_set_epi64x(5, 6);
4043        let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _);
4044        assert_eq_m128i(a, r);
4045    }
4046
4047    #[simd_test(enable = "sse2")]
4048    // Miri cannot support this until it is clear how it fits in the Rust memory model
4049    // (non-temporal store)
4050    #[cfg_attr(miri, ignore)]
4051    unsafe fn test_mm_maskmoveu_si128() {
4052        let a = _mm_set1_epi8(9);
4053        #[rustfmt::skip]
4054        let mask = _mm_set_epi8(
4055            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
4056            0, 0, 0, 0, 0, 0, 0, 0,
4057        );
4058        let mut r = _mm_set1_epi8(0);
4059        _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
4060        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4061        assert_eq_m128i(r, e);
4062    }
4063
4064    #[simd_test(enable = "sse2")]
4065    unsafe fn test_mm_store_si128() {
4066        let a = _mm_set1_epi8(9);
4067        let mut r = _mm_set1_epi8(0);
4068        _mm_store_si128(&mut r, a);
4069        assert_eq_m128i(r, a);
4070    }
4071
4072    #[simd_test(enable = "sse2")]
4073    unsafe fn test_mm_storeu_si128() {
4074        let a = _mm_set1_epi8(9);
4075        let mut r = _mm_set1_epi8(0);
4076        _mm_storeu_si128(&mut r, a);
4077        assert_eq_m128i(r, a);
4078    }
4079
4080    #[simd_test(enable = "sse2")]
4081    unsafe fn test_mm_storel_epi64() {
4082        let a = _mm_setr_epi64x(2, 9);
4083        let mut r = _mm_set1_epi8(0);
4084        _mm_storel_epi64(&mut r, a);
4085        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
4086    }
4087
4088    #[simd_test(enable = "sse2")]
4089    // Miri cannot support this until it is clear how it fits in the Rust memory model
4090    // (non-temporal store)
4091    #[cfg_attr(miri, ignore)]
4092    unsafe fn test_mm_stream_si128() {
4093        let a = _mm_setr_epi32(1, 2, 3, 4);
4094        let mut r = _mm_undefined_si128();
4095        _mm_stream_si128(ptr::addr_of_mut!(r), a);
4096        assert_eq_m128i(r, a);
4097    }
4098
4099    #[simd_test(enable = "sse2")]
4100    // Miri cannot support this until it is clear how it fits in the Rust memory model
4101    // (non-temporal store)
4102    #[cfg_attr(miri, ignore)]
4103    unsafe fn test_mm_stream_si32() {
4104        let a: i32 = 7;
4105        let mut mem = boxed::Box::<i32>::new(-1);
4106        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
4107        assert_eq!(a, *mem);
4108    }
4109
4110    #[simd_test(enable = "sse2")]
4111    unsafe fn test_mm_move_epi64() {
4112        let a = _mm_setr_epi64x(5, 6);
4113        let r = _mm_move_epi64(a);
4114        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4115    }
4116
4117    #[simd_test(enable = "sse2")]
4118    unsafe fn test_mm_packs_epi16() {
4119        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
4120        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
4121        let r = _mm_packs_epi16(a, b);
4122        #[rustfmt::skip]
4123        assert_eq_m128i(
4124            r,
4125            _mm_setr_epi8(
4126                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
4127            )
4128        );
4129    }
4130
4131    #[simd_test(enable = "sse2")]
4132    unsafe fn test_mm_packs_epi32() {
4133        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
4134        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
4135        let r = _mm_packs_epi32(a, b);
4136        assert_eq_m128i(
4137            r,
4138            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
4139        );
4140    }
4141
4142    #[simd_test(enable = "sse2")]
4143    unsafe fn test_mm_packus_epi16() {
4144        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
4145        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
4146        let r = _mm_packus_epi16(a, b);
4147        assert_eq_m128i(
4148            r,
4149            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
4150        );
4151    }
4152
4153    #[simd_test(enable = "sse2")]
4154    unsafe fn test_mm_extract_epi16() {
4155        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
4156        let r1 = _mm_extract_epi16::<0>(a);
4157        let r2 = _mm_extract_epi16::<3>(a);
4158        assert_eq!(r1, 0xFFFF);
4159        assert_eq!(r2, 3);
4160    }
4161
4162    #[simd_test(enable = "sse2")]
4163    unsafe fn test_mm_insert_epi16() {
4164        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4165        let r = _mm_insert_epi16::<0>(a, 9);
4166        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
4167        assert_eq_m128i(r, e);
4168    }
4169
4170    #[simd_test(enable = "sse2")]
4171    unsafe fn test_mm_movemask_epi8() {
4172        #[rustfmt::skip]
4173        let a = _mm_setr_epi8(
4174            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
4175            0b0101, 0b1111_0000u8 as i8, 0, 0,
4176            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
4177            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
4178        );
4179        let r = _mm_movemask_epi8(a);
4180        assert_eq!(r, 0b10100110_00100101);
4181    }
4182
4183    #[simd_test(enable = "sse2")]
4184    unsafe fn test_mm_shuffle_epi32() {
4185        let a = _mm_setr_epi32(5, 10, 15, 20);
4186        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
4187        let e = _mm_setr_epi32(20, 10, 10, 5);
4188        assert_eq_m128i(r, e);
4189    }
4190
4191    #[simd_test(enable = "sse2")]
4192    unsafe fn test_mm_shufflehi_epi16() {
4193        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
4194        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
4195        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
4196        assert_eq_m128i(r, e);
4197    }
4198
4199    #[simd_test(enable = "sse2")]
4200    unsafe fn test_mm_shufflelo_epi16() {
4201        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
4202        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
4203        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
4204        assert_eq_m128i(r, e);
4205    }
4206
4207    #[simd_test(enable = "sse2")]
4208    unsafe fn test_mm_unpackhi_epi8() {
4209        #[rustfmt::skip]
4210        let a = _mm_setr_epi8(
4211            0, 1, 2, 3, 4, 5, 6, 7,
4212            8, 9, 10, 11, 12, 13, 14, 15,
4213        );
4214        #[rustfmt::skip]
4215        let b = _mm_setr_epi8(
4216            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4217        );
4218        let r = _mm_unpackhi_epi8(a, b);
4219        #[rustfmt::skip]
4220        let e = _mm_setr_epi8(
4221            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
4222        );
4223        assert_eq_m128i(r, e);
4224    }
4225
4226    #[simd_test(enable = "sse2")]
4227    unsafe fn test_mm_unpackhi_epi16() {
4228        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4229        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4230        let r = _mm_unpackhi_epi16(a, b);
4231        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
4232        assert_eq_m128i(r, e);
4233    }
4234
4235    #[simd_test(enable = "sse2")]
4236    unsafe fn test_mm_unpackhi_epi32() {
4237        let a = _mm_setr_epi32(0, 1, 2, 3);
4238        let b = _mm_setr_epi32(4, 5, 6, 7);
4239        let r = _mm_unpackhi_epi32(a, b);
4240        let e = _mm_setr_epi32(2, 6, 3, 7);
4241        assert_eq_m128i(r, e);
4242    }
4243
4244    #[simd_test(enable = "sse2")]
4245    unsafe fn test_mm_unpackhi_epi64() {
4246        let a = _mm_setr_epi64x(0, 1);
4247        let b = _mm_setr_epi64x(2, 3);
4248        let r = _mm_unpackhi_epi64(a, b);
4249        let e = _mm_setr_epi64x(1, 3);
4250        assert_eq_m128i(r, e);
4251    }
4252
4253    #[simd_test(enable = "sse2")]
4254    unsafe fn test_mm_unpacklo_epi8() {
4255        #[rustfmt::skip]
4256        let a = _mm_setr_epi8(
4257            0, 1, 2, 3, 4, 5, 6, 7,
4258            8, 9, 10, 11, 12, 13, 14, 15,
4259        );
4260        #[rustfmt::skip]
4261        let b = _mm_setr_epi8(
4262            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4263        );
4264        let r = _mm_unpacklo_epi8(a, b);
4265        #[rustfmt::skip]
4266        let e = _mm_setr_epi8(
4267            0, 16, 1, 17, 2, 18, 3, 19,
4268            4, 20, 5, 21, 6, 22, 7, 23,
4269        );
4270        assert_eq_m128i(r, e);
4271    }
4272
4273    #[simd_test(enable = "sse2")]
4274    unsafe fn test_mm_unpacklo_epi16() {
4275        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4276        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4277        let r = _mm_unpacklo_epi16(a, b);
4278        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
4279        assert_eq_m128i(r, e);
4280    }
4281
4282    #[simd_test(enable = "sse2")]
4283    unsafe fn test_mm_unpacklo_epi32() {
4284        let a = _mm_setr_epi32(0, 1, 2, 3);
4285        let b = _mm_setr_epi32(4, 5, 6, 7);
4286        let r = _mm_unpacklo_epi32(a, b);
4287        let e = _mm_setr_epi32(0, 4, 1, 5);
4288        assert_eq_m128i(r, e);
4289    }
4290
4291    #[simd_test(enable = "sse2")]
4292    unsafe fn test_mm_unpacklo_epi64() {
4293        let a = _mm_setr_epi64x(0, 1);
4294        let b = _mm_setr_epi64x(2, 3);
4295        let r = _mm_unpacklo_epi64(a, b);
4296        let e = _mm_setr_epi64x(0, 2);
4297        assert_eq_m128i(r, e);
4298    }
4299
4300    #[simd_test(enable = "sse2")]
4301    unsafe fn test_mm_add_sd() {
4302        let a = _mm_setr_pd(1.0, 2.0);
4303        let b = _mm_setr_pd(5.0, 10.0);
4304        let r = _mm_add_sd(a, b);
4305        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
4306    }
4307
4308    #[simd_test(enable = "sse2")]
4309    unsafe fn test_mm_add_pd() {
4310        let a = _mm_setr_pd(1.0, 2.0);
4311        let b = _mm_setr_pd(5.0, 10.0);
4312        let r = _mm_add_pd(a, b);
4313        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
4314    }
4315
4316    #[simd_test(enable = "sse2")]
4317    unsafe fn test_mm_div_sd() {
4318        let a = _mm_setr_pd(1.0, 2.0);
4319        let b = _mm_setr_pd(5.0, 10.0);
4320        let r = _mm_div_sd(a, b);
4321        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
4322    }
4323
4324    #[simd_test(enable = "sse2")]
4325    unsafe fn test_mm_div_pd() {
4326        let a = _mm_setr_pd(1.0, 2.0);
4327        let b = _mm_setr_pd(5.0, 10.0);
4328        let r = _mm_div_pd(a, b);
4329        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
4330    }
4331
4332    #[simd_test(enable = "sse2")]
4333    unsafe fn test_mm_max_sd() {
4334        let a = _mm_setr_pd(1.0, 2.0);
4335        let b = _mm_setr_pd(5.0, 10.0);
4336        let r = _mm_max_sd(a, b);
4337        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4338    }
4339
4340    #[simd_test(enable = "sse2")]
4341    unsafe fn test_mm_max_pd() {
4342        let a = _mm_setr_pd(1.0, 2.0);
4343        let b = _mm_setr_pd(5.0, 10.0);
4344        let r = _mm_max_pd(a, b);
4345        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
4346
4347        // Check SSE(2)-specific semantics for -0.0 handling.
4348        let a = _mm_setr_pd(-0.0, 0.0);
4349        let b = _mm_setr_pd(0.0, 0.0);
4350        let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
4351        let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
4352        let a: [u8; 16] = transmute(a);
4353        let b: [u8; 16] = transmute(b);
4354        assert_eq!(r1, b);
4355        assert_eq!(r2, a);
4356        assert_ne!(a, b); // sanity check that -0.0 is actually present
4357    }
4358
4359    #[simd_test(enable = "sse2")]
4360    unsafe fn test_mm_min_sd() {
4361        let a = _mm_setr_pd(1.0, 2.0);
4362        let b = _mm_setr_pd(5.0, 10.0);
4363        let r = _mm_min_sd(a, b);
4364        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4365    }
4366
4367    #[simd_test(enable = "sse2")]
4368    unsafe fn test_mm_min_pd() {
4369        let a = _mm_setr_pd(1.0, 2.0);
4370        let b = _mm_setr_pd(5.0, 10.0);
4371        let r = _mm_min_pd(a, b);
4372        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4373
4374        // Check SSE(2)-specific semantics for -0.0 handling.
4375        let a = _mm_setr_pd(-0.0, 0.0);
4376        let b = _mm_setr_pd(0.0, 0.0);
4377        let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
4378        let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
4379        let a: [u8; 16] = transmute(a);
4380        let b: [u8; 16] = transmute(b);
4381        assert_eq!(r1, b);
4382        assert_eq!(r2, a);
4383        assert_ne!(a, b); // sanity check that -0.0 is actually present
4384    }
4385
4386    #[simd_test(enable = "sse2")]
4387    unsafe fn test_mm_mul_sd() {
4388        let a = _mm_setr_pd(1.0, 2.0);
4389        let b = _mm_setr_pd(5.0, 10.0);
4390        let r = _mm_mul_sd(a, b);
4391        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4392    }
4393
4394    #[simd_test(enable = "sse2")]
4395    unsafe fn test_mm_mul_pd() {
4396        let a = _mm_setr_pd(1.0, 2.0);
4397        let b = _mm_setr_pd(5.0, 10.0);
4398        let r = _mm_mul_pd(a, b);
4399        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
4400    }
4401
4402    #[simd_test(enable = "sse2")]
4403    unsafe fn test_mm_sqrt_sd() {
4404        let a = _mm_setr_pd(1.0, 2.0);
4405        let b = _mm_setr_pd(5.0, 10.0);
4406        let r = _mm_sqrt_sd(a, b);
4407        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
4408    }
4409
4410    #[simd_test(enable = "sse2")]
4411    unsafe fn test_mm_sqrt_pd() {
4412        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
4413        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
4414    }
4415
4416    #[simd_test(enable = "sse2")]
4417    unsafe fn test_mm_sub_sd() {
4418        let a = _mm_setr_pd(1.0, 2.0);
4419        let b = _mm_setr_pd(5.0, 10.0);
4420        let r = _mm_sub_sd(a, b);
4421        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
4422    }
4423
4424    #[simd_test(enable = "sse2")]
4425    unsafe fn test_mm_sub_pd() {
4426        let a = _mm_setr_pd(1.0, 2.0);
4427        let b = _mm_setr_pd(5.0, 10.0);
4428        let r = _mm_sub_pd(a, b);
4429        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
4430    }
4431
4432    #[simd_test(enable = "sse2")]
4433    unsafe fn test_mm_and_pd() {
4434        let a = transmute(u64x2::splat(5));
4435        let b = transmute(u64x2::splat(3));
4436        let r = _mm_and_pd(a, b);
4437        let e = transmute(u64x2::splat(1));
4438        assert_eq_m128d(r, e);
4439    }
4440
4441    #[simd_test(enable = "sse2")]
4442    unsafe fn test_mm_andnot_pd() {
4443        let a = transmute(u64x2::splat(5));
4444        let b = transmute(u64x2::splat(3));
4445        let r = _mm_andnot_pd(a, b);
4446        let e = transmute(u64x2::splat(2));
4447        assert_eq_m128d(r, e);
4448    }
4449
4450    #[simd_test(enable = "sse2")]
4451    unsafe fn test_mm_or_pd() {
4452        let a = transmute(u64x2::splat(5));
4453        let b = transmute(u64x2::splat(3));
4454        let r = _mm_or_pd(a, b);
4455        let e = transmute(u64x2::splat(7));
4456        assert_eq_m128d(r, e);
4457    }
4458
4459    #[simd_test(enable = "sse2")]
4460    unsafe fn test_mm_xor_pd() {
4461        let a = transmute(u64x2::splat(5));
4462        let b = transmute(u64x2::splat(3));
4463        let r = _mm_xor_pd(a, b);
4464        let e = transmute(u64x2::splat(6));
4465        assert_eq_m128d(r, e);
4466    }
4467
4468    #[simd_test(enable = "sse2")]
4469    unsafe fn test_mm_cmpeq_sd() {
4470        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4471        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4472        let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
4473        assert_eq_m128i(r, e);
4474    }
4475
4476    #[simd_test(enable = "sse2")]
4477    unsafe fn test_mm_cmplt_sd() {
4478        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4479        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4480        let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
4481        assert_eq_m128i(r, e);
4482    }
4483
4484    #[simd_test(enable = "sse2")]
4485    unsafe fn test_mm_cmple_sd() {
4486        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4487        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4488        let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
4489        assert_eq_m128i(r, e);
4490    }
4491
4492    #[simd_test(enable = "sse2")]
4493    unsafe fn test_mm_cmpgt_sd() {
4494        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4495        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4496        let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
4497        assert_eq_m128i(r, e);
4498    }
4499
4500    #[simd_test(enable = "sse2")]
4501    unsafe fn test_mm_cmpge_sd() {
4502        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4503        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4504        let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
4505        assert_eq_m128i(r, e);
4506    }
4507
4508    #[simd_test(enable = "sse2")]
4509    unsafe fn test_mm_cmpord_sd() {
4510        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4511        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4512        let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
4513        assert_eq_m128i(r, e);
4514    }
4515
4516    #[simd_test(enable = "sse2")]
4517    unsafe fn test_mm_cmpunord_sd() {
4518        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4519        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4520        let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
4521        assert_eq_m128i(r, e);
4522    }
4523
4524    #[simd_test(enable = "sse2")]
4525    unsafe fn test_mm_cmpneq_sd() {
4526        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4527        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4528        let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
4529        assert_eq_m128i(r, e);
4530    }
4531
4532    #[simd_test(enable = "sse2")]
4533    unsafe fn test_mm_cmpnlt_sd() {
4534        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4535        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4536        let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
4537        assert_eq_m128i(r, e);
4538    }
4539
4540    #[simd_test(enable = "sse2")]
4541    unsafe fn test_mm_cmpnle_sd() {
4542        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4543        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4544        let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
4545        assert_eq_m128i(r, e);
4546    }
4547
4548    #[simd_test(enable = "sse2")]
4549    unsafe fn test_mm_cmpngt_sd() {
4550        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4551        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4552        let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
4553        assert_eq_m128i(r, e);
4554    }
4555
4556    #[simd_test(enable = "sse2")]
4557    unsafe fn test_mm_cmpnge_sd() {
4558        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4559        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4560        let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
4561        assert_eq_m128i(r, e);
4562    }
4563
4564    #[simd_test(enable = "sse2")]
4565    unsafe fn test_mm_cmpeq_pd() {
4566        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4567        let e = _mm_setr_epi64x(!0, 0);
4568        let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
4569        assert_eq_m128i(r, e);
4570    }
4571
4572    #[simd_test(enable = "sse2")]
4573    unsafe fn test_mm_cmplt_pd() {
4574        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4575        let e = _mm_setr_epi64x(0, !0);
4576        let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
4577        assert_eq_m128i(r, e);
4578    }
4579
4580    #[simd_test(enable = "sse2")]
4581    unsafe fn test_mm_cmple_pd() {
4582        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4583        let e = _mm_setr_epi64x(!0, !0);
4584        let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
4585        assert_eq_m128i(r, e);
4586    }
4587
4588    #[simd_test(enable = "sse2")]
4589    unsafe fn test_mm_cmpgt_pd() {
4590        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4591        let e = _mm_setr_epi64x(0, 0);
4592        let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
4593        assert_eq_m128i(r, e);
4594    }
4595
4596    #[simd_test(enable = "sse2")]
4597    unsafe fn test_mm_cmpge_pd() {
4598        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4599        let e = _mm_setr_epi64x(!0, 0);
4600        let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
4601        assert_eq_m128i(r, e);
4602    }
4603
4604    #[simd_test(enable = "sse2")]
4605    unsafe fn test_mm_cmpord_pd() {
4606        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4607        let e = _mm_setr_epi64x(0, !0);
4608        let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
4609        assert_eq_m128i(r, e);
4610    }
4611
4612    #[simd_test(enable = "sse2")]
4613    unsafe fn test_mm_cmpunord_pd() {
4614        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4615        let e = _mm_setr_epi64x(!0, 0);
4616        let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
4617        assert_eq_m128i(r, e);
4618    }
4619
4620    #[simd_test(enable = "sse2")]
4621    unsafe fn test_mm_cmpneq_pd() {
4622        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4623        let e = _mm_setr_epi64x(!0, !0);
4624        let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
4625        assert_eq_m128i(r, e);
4626    }
4627
4628    #[simd_test(enable = "sse2")]
4629    unsafe fn test_mm_cmpnlt_pd() {
4630        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4631        let e = _mm_setr_epi64x(0, 0);
4632        let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
4633        assert_eq_m128i(r, e);
4634    }
4635
4636    #[simd_test(enable = "sse2")]
4637    unsafe fn test_mm_cmpnle_pd() {
4638        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4639        let e = _mm_setr_epi64x(0, 0);
4640        let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
4641        assert_eq_m128i(r, e);
4642    }
4643
4644    #[simd_test(enable = "sse2")]
4645    unsafe fn test_mm_cmpngt_pd() {
4646        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4647        let e = _mm_setr_epi64x(0, !0);
4648        let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
4649        assert_eq_m128i(r, e);
4650    }
4651
4652    #[simd_test(enable = "sse2")]
4653    unsafe fn test_mm_cmpnge_pd() {
4654        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4655        let e = _mm_setr_epi64x(0, !0);
4656        let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
4657        assert_eq_m128i(r, e);
4658    }
4659
4660    #[simd_test(enable = "sse2")]
4661    unsafe fn test_mm_comieq_sd() {
4662        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4663        assert!(_mm_comieq_sd(a, b) != 0);
4664
4665        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
4666        assert!(_mm_comieq_sd(a, b) == 0);
4667    }
4668
4669    #[simd_test(enable = "sse2")]
4670    unsafe fn test_mm_comilt_sd() {
4671        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4672        assert!(_mm_comilt_sd(a, b) == 0);
4673    }
4674
4675    #[simd_test(enable = "sse2")]
4676    unsafe fn test_mm_comile_sd() {
4677        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4678        assert!(_mm_comile_sd(a, b) != 0);
4679    }
4680
4681    #[simd_test(enable = "sse2")]
4682    unsafe fn test_mm_comigt_sd() {
4683        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4684        assert!(_mm_comigt_sd(a, b) == 0);
4685    }
4686
4687    #[simd_test(enable = "sse2")]
4688    unsafe fn test_mm_comige_sd() {
4689        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4690        assert!(_mm_comige_sd(a, b) != 0);
4691    }
4692
4693    #[simd_test(enable = "sse2")]
4694    unsafe fn test_mm_comineq_sd() {
4695        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4696        assert!(_mm_comineq_sd(a, b) == 0);
4697    }
4698
4699    #[simd_test(enable = "sse2")]
4700    unsafe fn test_mm_ucomieq_sd() {
4701        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4702        assert!(_mm_ucomieq_sd(a, b) != 0);
4703
4704        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
4705        assert!(_mm_ucomieq_sd(a, b) == 0);
4706    }
4707
4708    #[simd_test(enable = "sse2")]
4709    unsafe fn test_mm_ucomilt_sd() {
4710        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4711        assert!(_mm_ucomilt_sd(a, b) == 0);
4712    }
4713
4714    #[simd_test(enable = "sse2")]
4715    unsafe fn test_mm_ucomile_sd() {
4716        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4717        assert!(_mm_ucomile_sd(a, b) != 0);
4718    }
4719
4720    #[simd_test(enable = "sse2")]
4721    unsafe fn test_mm_ucomigt_sd() {
4722        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4723        assert!(_mm_ucomigt_sd(a, b) == 0);
4724    }
4725
4726    #[simd_test(enable = "sse2")]
4727    unsafe fn test_mm_ucomige_sd() {
4728        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4729        assert!(_mm_ucomige_sd(a, b) != 0);
4730    }
4731
4732    #[simd_test(enable = "sse2")]
4733    unsafe fn test_mm_ucomineq_sd() {
4734        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4735        assert!(_mm_ucomineq_sd(a, b) == 0);
4736    }
4737
4738    #[simd_test(enable = "sse2")]
4739    unsafe fn test_mm_movemask_pd() {
4740        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
4741        assert_eq!(r, 0b01);
4742
4743        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
4744        assert_eq!(r, 0b11);
4745    }
4746
4747    #[repr(align(16))]
4748    struct Memory {
4749        data: [f64; 4],
4750    }
4751
4752    #[simd_test(enable = "sse2")]
4753    unsafe fn test_mm_load_pd() {
4754        let mem = Memory {
4755            data: [1.0f64, 2.0, 3.0, 4.0],
4756        };
4757        let vals = &mem.data;
4758        let d = vals.as_ptr();
4759
4760        let r = _mm_load_pd(d);
4761        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4762    }
4763
4764    #[simd_test(enable = "sse2")]
4765    unsafe fn test_mm_load_sd() {
4766        let a = 1.;
4767        let expected = _mm_setr_pd(a, 0.);
4768        let r = _mm_load_sd(&a);
4769        assert_eq_m128d(r, expected);
4770    }
4771
4772    #[simd_test(enable = "sse2")]
4773    unsafe fn test_mm_loadh_pd() {
4774        let a = _mm_setr_pd(1., 2.);
4775        let b = 3.;
4776        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
4777        let r = _mm_loadh_pd(a, &b);
4778        assert_eq_m128d(r, expected);
4779    }
4780
4781    #[simd_test(enable = "sse2")]
4782    unsafe fn test_mm_loadl_pd() {
4783        let a = _mm_setr_pd(1., 2.);
4784        let b = 3.;
4785        let expected = _mm_setr_pd(3., get_m128d(a, 1));
4786        let r = _mm_loadl_pd(a, &b);
4787        assert_eq_m128d(r, expected);
4788    }
4789
4790    #[simd_test(enable = "sse2")]
4791    // Miri cannot support this until it is clear how it fits in the Rust memory model
4792    // (non-temporal store)
4793    #[cfg_attr(miri, ignore)]
4794    unsafe fn test_mm_stream_pd() {
4795        #[repr(align(128))]
4796        struct Memory {
4797            pub data: [f64; 2],
4798        }
4799        let a = _mm_set1_pd(7.0);
4800        let mut mem = Memory { data: [-1.0; 2] };
4801
4802        _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4803        for i in 0..2 {
4804            assert_eq!(mem.data[i], get_m128d(a, i));
4805        }
4806    }
4807
4808    #[simd_test(enable = "sse2")]
4809    unsafe fn test_mm_store_sd() {
4810        let mut dest = 0.;
4811        let a = _mm_setr_pd(1., 2.);
4812        _mm_store_sd(&mut dest, a);
4813        assert_eq!(dest, _mm_cvtsd_f64(a));
4814    }
4815
4816    #[simd_test(enable = "sse2")]
4817    unsafe fn test_mm_store_pd() {
4818        let mut mem = Memory { data: [0.0f64; 4] };
4819        let vals = &mut mem.data;
4820        let a = _mm_setr_pd(1.0, 2.0);
4821        let d = vals.as_mut_ptr();
4822
4823        _mm_store_pd(d, *black_box(&a));
4824        assert_eq!(vals[0], 1.0);
4825        assert_eq!(vals[1], 2.0);
4826    }
4827
4828    #[simd_test(enable = "sse2")]
4829    unsafe fn test_mm_storeu_pd() {
4830        let mut mem = Memory { data: [0.0f64; 4] };
4831        let vals = &mut mem.data;
4832        let a = _mm_setr_pd(1.0, 2.0);
4833
4834        let mut ofs = 0;
4835        let mut p = vals.as_mut_ptr();
4836
4837        // Make sure p is **not** aligned to 16-byte boundary
4838        if (p as usize) & 0xf == 0 {
4839            ofs = 1;
4840            p = p.add(1);
4841        }
4842
4843        _mm_storeu_pd(p, *black_box(&a));
4844
4845        if ofs > 0 {
4846            assert_eq!(vals[ofs - 1], 0.0);
4847        }
4848        assert_eq!(vals[ofs + 0], 1.0);
4849        assert_eq!(vals[ofs + 1], 2.0);
4850    }
4851
4852    #[simd_test(enable = "sse2")]
4853    unsafe fn test_mm_storeu_si16() {
4854        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4855        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
4856        _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
4857        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
4858        assert_eq_m128i(r, e);
4859    }
4860
4861    #[simd_test(enable = "sse2")]
4862    unsafe fn test_mm_storeu_si32() {
4863        let a = _mm_setr_epi32(1, 2, 3, 4);
4864        let mut r = _mm_setr_epi32(5, 6, 7, 8);
4865        _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
4866        let e = _mm_setr_epi32(1, 6, 7, 8);
4867        assert_eq_m128i(r, e);
4868    }
4869
4870    #[simd_test(enable = "sse2")]
4871    unsafe fn test_mm_storeu_si64() {
4872        let a = _mm_setr_epi64x(1, 2);
4873        let mut r = _mm_setr_epi64x(3, 4);
4874        _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
4875        let e = _mm_setr_epi64x(1, 4);
4876        assert_eq_m128i(r, e);
4877    }
4878
4879    #[simd_test(enable = "sse2")]
4880    unsafe fn test_mm_store1_pd() {
4881        let mut mem = Memory { data: [0.0f64; 4] };
4882        let vals = &mut mem.data;
4883        let a = _mm_setr_pd(1.0, 2.0);
4884        let d = vals.as_mut_ptr();
4885
4886        _mm_store1_pd(d, *black_box(&a));
4887        assert_eq!(vals[0], 1.0);
4888        assert_eq!(vals[1], 1.0);
4889    }
4890
4891    #[simd_test(enable = "sse2")]
4892    unsafe fn test_mm_store_pd1() {
4893        let mut mem = Memory { data: [0.0f64; 4] };
4894        let vals = &mut mem.data;
4895        let a = _mm_setr_pd(1.0, 2.0);
4896        let d = vals.as_mut_ptr();
4897
4898        _mm_store_pd1(d, *black_box(&a));
4899        assert_eq!(vals[0], 1.0);
4900        assert_eq!(vals[1], 1.0);
4901    }
4902
4903    #[simd_test(enable = "sse2")]
4904    unsafe fn test_mm_storer_pd() {
4905        let mut mem = Memory { data: [0.0f64; 4] };
4906        let vals = &mut mem.data;
4907        let a = _mm_setr_pd(1.0, 2.0);
4908        let d = vals.as_mut_ptr();
4909
4910        _mm_storer_pd(d, *black_box(&a));
4911        assert_eq!(vals[0], 2.0);
4912        assert_eq!(vals[1], 1.0);
4913    }
4914
4915    #[simd_test(enable = "sse2")]
4916    unsafe fn test_mm_storeh_pd() {
4917        let mut dest = 0.;
4918        let a = _mm_setr_pd(1., 2.);
4919        _mm_storeh_pd(&mut dest, a);
4920        assert_eq!(dest, get_m128d(a, 1));
4921    }
4922
4923    #[simd_test(enable = "sse2")]
4924    unsafe fn test_mm_storel_pd() {
4925        let mut dest = 0.;
4926        let a = _mm_setr_pd(1., 2.);
4927        _mm_storel_pd(&mut dest, a);
4928        assert_eq!(dest, _mm_cvtsd_f64(a));
4929    }
4930
4931    #[simd_test(enable = "sse2")]
4932    unsafe fn test_mm_loadr_pd() {
4933        let mut mem = Memory {
4934            data: [1.0f64, 2.0, 3.0, 4.0],
4935        };
4936        let vals = &mut mem.data;
4937        let d = vals.as_ptr();
4938
4939        let r = _mm_loadr_pd(d);
4940        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
4941    }
4942
4943    #[simd_test(enable = "sse2")]
4944    unsafe fn test_mm_loadu_pd() {
4945        let mut mem = Memory {
4946            data: [1.0f64, 2.0, 3.0, 4.0],
4947        };
4948        let vals = &mut mem.data;
4949        let mut d = vals.as_ptr();
4950
4951        // make sure d is not aligned to 16-byte boundary
4952        let mut offset = 0;
4953        if (d as usize) & 0xf == 0 {
4954            offset = 1;
4955            d = d.add(offset);
4956        }
4957
4958        let r = _mm_loadu_pd(d);
4959        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
4960        assert_eq_m128d(r, e);
4961    }
4962
4963    #[simd_test(enable = "sse2")]
4964    unsafe fn test_mm_loadu_si16() {
4965        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4966        let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
4967        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
4968    }
4969
4970    #[simd_test(enable = "sse2")]
4971    unsafe fn test_mm_loadu_si32() {
4972        let a = _mm_setr_epi32(1, 2, 3, 4);
4973        let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
4974        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
4975    }
4976
4977    #[simd_test(enable = "sse2")]
4978    unsafe fn test_mm_loadu_si64() {
4979        let a = _mm_setr_epi64x(5, 6);
4980        let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
4981        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4982    }
4983
4984    #[simd_test(enable = "sse2")]
4985    unsafe fn test_mm_cvtpd_ps() {
4986        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
4987        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));
4988
4989        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
4990        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));
4991
4992        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
4993        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
4994
4995        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
4996        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
4997    }
4998
4999    #[simd_test(enable = "sse2")]
5000    unsafe fn test_mm_cvtps_pd() {
5001        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
5002        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));
5003
5004        let r = _mm_cvtps_pd(_mm_setr_ps(
5005            f32::MAX,
5006            f32::INFINITY,
5007            f32::NEG_INFINITY,
5008            f32::MIN,
5009        ));
5010        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
5011    }
5012
5013    #[simd_test(enable = "sse2")]
5014    unsafe fn test_mm_cvtpd_epi32() {
5015        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
5016        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));
5017
5018        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
5019        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));
5020
5021        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
5022        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5023
5024        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
5025        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5026
5027        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
5028        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5029    }
5030
5031    #[simd_test(enable = "sse2")]
5032    unsafe fn test_mm_cvtsd_si32() {
5033        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
5034        assert_eq!(r, -2);
5035
5036        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
5037        assert_eq!(r, i32::MIN);
5038
5039        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
5040        assert_eq!(r, i32::MIN);
5041    }
5042
5043    #[simd_test(enable = "sse2")]
5044    unsafe fn test_mm_cvtsd_ss() {
5045        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
5046        let b = _mm_setr_pd(2.0, -5.0);
5047
5048        let r = _mm_cvtsd_ss(a, b);
5049
5050        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
5051
5052        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
5053        let b = _mm_setr_pd(f64::INFINITY, -5.0);
5054
5055        let r = _mm_cvtsd_ss(a, b);
5056
5057        assert_eq_m128(
5058            r,
5059            _mm_setr_ps(
5060                f32::INFINITY,
5061                f32::NEG_INFINITY,
5062                f32::MAX,
5063                f32::NEG_INFINITY,
5064            ),
5065        );
5066    }
5067
5068    #[simd_test(enable = "sse2")]
5069    unsafe fn test_mm_cvtsd_f64() {
5070        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
5071        assert_eq!(r, -1.1);
5072    }
5073
5074    #[simd_test(enable = "sse2")]
5075    unsafe fn test_mm_cvtss_sd() {
5076        let a = _mm_setr_pd(-1.1, 2.2);
5077        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
5078
5079        let r = _mm_cvtss_sd(a, b);
5080        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));
5081
5082        let a = _mm_setr_pd(-1.1, f64::INFINITY);
5083        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);
5084
5085        let r = _mm_cvtss_sd(a, b);
5086        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
5087    }
5088
5089    #[simd_test(enable = "sse2")]
5090    unsafe fn test_mm_cvttpd_epi32() {
5091        let a = _mm_setr_pd(-1.1, 2.2);
5092        let r = _mm_cvttpd_epi32(a);
5093        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));
5094
5095        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5096        let r = _mm_cvttpd_epi32(a);
5097        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5098    }
5099
5100    #[simd_test(enable = "sse2")]
5101    unsafe fn test_mm_cvttsd_si32() {
5102        let a = _mm_setr_pd(-1.1, 2.2);
5103        let r = _mm_cvttsd_si32(a);
5104        assert_eq!(r, -1);
5105
5106        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5107        let r = _mm_cvttsd_si32(a);
5108        assert_eq!(r, i32::MIN);
5109    }
5110
5111    #[simd_test(enable = "sse2")]
5112    unsafe fn test_mm_cvttps_epi32() {
5113        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
5114        let r = _mm_cvttps_epi32(a);
5115        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
5116
5117        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
5118        let r = _mm_cvttps_epi32(a);
5119        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
5120    }
5121
5122    #[simd_test(enable = "sse2")]
5123    unsafe fn test_mm_set_sd() {
5124        let r = _mm_set_sd(-1.0_f64);
5125        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
5126    }
5127
5128    #[simd_test(enable = "sse2")]
5129    unsafe fn test_mm_set1_pd() {
5130        let r = _mm_set1_pd(-1.0_f64);
5131        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
5132    }
5133
5134    #[simd_test(enable = "sse2")]
5135    unsafe fn test_mm_set_pd1() {
5136        let r = _mm_set_pd1(-2.0_f64);
5137        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
5138    }
5139
5140    #[simd_test(enable = "sse2")]
5141    unsafe fn test_mm_set_pd() {
5142        let r = _mm_set_pd(1.0_f64, 5.0_f64);
5143        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
5144    }
5145
5146    #[simd_test(enable = "sse2")]
5147    unsafe fn test_mm_setr_pd() {
5148        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
5149        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
5150    }
5151
5152    #[simd_test(enable = "sse2")]
5153    unsafe fn test_mm_setzero_pd() {
5154        let r = _mm_setzero_pd();
5155        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
5156    }
5157
5158    #[simd_test(enable = "sse2")]
5159    unsafe fn test_mm_load1_pd() {
5160        let d = -5.0;
5161        let r = _mm_load1_pd(&d);
5162        assert_eq_m128d(r, _mm_setr_pd(d, d));
5163    }
5164
5165    #[simd_test(enable = "sse2")]
5166    unsafe fn test_mm_load_pd1() {
5167        let d = -5.0;
5168        let r = _mm_load_pd1(&d);
5169        assert_eq_m128d(r, _mm_setr_pd(d, d));
5170    }
5171
5172    #[simd_test(enable = "sse2")]
5173    unsafe fn test_mm_unpackhi_pd() {
5174        let a = _mm_setr_pd(1.0, 2.0);
5175        let b = _mm_setr_pd(3.0, 4.0);
5176        let r = _mm_unpackhi_pd(a, b);
5177        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
5178    }
5179
5180    #[simd_test(enable = "sse2")]
5181    unsafe fn test_mm_unpacklo_pd() {
5182        let a = _mm_setr_pd(1.0, 2.0);
5183        let b = _mm_setr_pd(3.0, 4.0);
5184        let r = _mm_unpacklo_pd(a, b);
5185        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
5186    }
5187
5188    #[simd_test(enable = "sse2")]
5189    unsafe fn test_mm_shuffle_pd() {
5190        let a = _mm_setr_pd(1., 2.);
5191        let b = _mm_setr_pd(3., 4.);
5192        let expected = _mm_setr_pd(1., 3.);
5193        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
5194        assert_eq_m128d(r, expected);
5195    }
5196
5197    #[simd_test(enable = "sse2")]
5198    unsafe fn test_mm_move_sd() {
5199        let a = _mm_setr_pd(1., 2.);
5200        let b = _mm_setr_pd(3., 4.);
5201        let expected = _mm_setr_pd(3., 2.);
5202        let r = _mm_move_sd(a, b);
5203        assert_eq_m128d(r, expected);
5204    }
5205
5206    #[simd_test(enable = "sse2")]
5207    unsafe fn test_mm_castpd_ps() {
5208        let a = _mm_set1_pd(0.);
5209        let expected = _mm_set1_ps(0.);
5210        let r = _mm_castpd_ps(a);
5211        assert_eq_m128(r, expected);
5212    }
5213
5214    #[simd_test(enable = "sse2")]
5215    unsafe fn test_mm_castpd_si128() {
5216        let a = _mm_set1_pd(0.);
5217        let expected = _mm_set1_epi64x(0);
5218        let r = _mm_castpd_si128(a);
5219        assert_eq_m128i(r, expected);
5220    }
5221
5222    #[simd_test(enable = "sse2")]
5223    unsafe fn test_mm_castps_pd() {
5224        let a = _mm_set1_ps(0.);
5225        let expected = _mm_set1_pd(0.);
5226        let r = _mm_castps_pd(a);
5227        assert_eq_m128d(r, expected);
5228    }
5229
5230    #[simd_test(enable = "sse2")]
5231    unsafe fn test_mm_castps_si128() {
5232        let a = _mm_set1_ps(0.);
5233        let expected = _mm_set1_epi32(0);
5234        let r = _mm_castps_si128(a);
5235        assert_eq_m128i(r, expected);
5236    }
5237
5238    #[simd_test(enable = "sse2")]
5239    unsafe fn test_mm_castsi128_pd() {
5240        let a = _mm_set1_epi64x(0);
5241        let expected = _mm_set1_pd(0.);
5242        let r = _mm_castsi128_pd(a);
5243        assert_eq_m128d(r, expected);
5244    }
5245
5246    #[simd_test(enable = "sse2")]
5247    unsafe fn test_mm_castsi128_ps() {
5248        let a = _mm_set1_epi32(0);
5249        let expected = _mm_set1_ps(0.);
5250        let r = _mm_castsi128_ps(a);
5251        assert_eq_m128(r, expected);
5252    }
5253}