// core/stdarch/crates/core_arch/src/x86/sse2.rs

//! Streaming SIMD Extensions 2 (SSE2)

#[cfg(test)]
use stdarch_test::assert_instr;

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf64,
    mem, ptr,
};

/// Provides a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
#[inline]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_pause() {
    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
    // the SSE2 target-feature - therefore it does not require any target features
    pause()
}
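
// Usage sketch (illustrative; not part of the original file): a typical
// spin-wait loop issues `_mm_pause` on each iteration to reduce power use
// and pipeline-flush penalties while polling a flag. The helper name and
// the `AtomicBool` protocol are assumptions made for this example.
#[cfg(test)]
#[allow(dead_code)]
fn spin_until_set_sketch(flag: &core::sync::atomic::AtomicBool) {
    use core::sync::atomic::Ordering;
    while !flag.load(Ordering::Acquire) {
        // SAFETY: `pause` executes as `nop` on CPUs without SSE2.
        unsafe { _mm_pause() };
    }
}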

/// Invalidates and flushes the cache line that contains `p` from all levels of
/// the cache hierarchy.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(clflush))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clflush(p: *const u8) {
    clflush(p)
}

/// Performs a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order,
/// the load fence instruction is globally visible before any load
/// instruction which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_lfence() {
    lfence()
}

/// Performs a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mfence() {
    mfence()
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_add(a.as_i8x16(), b.as_i8x16()))
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_add(a.as_i16x8(), b.as_i16x8()))
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_add(a.as_i32x4(), b.as_i32x4()))
}

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_add(a.as_i64x2(), b.as_i64x2()))
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16()))
}
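
// Usage sketch (illustrative; not part of the original file): saturating
// addition clamps at the type bounds instead of wrapping. The helper name
// is an assumption made for this example.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn adds_epi8_sketch() {
    let a = _mm_set1_epi8(100);
    let b = _mm_set1_epi8(100);
    // Wrapping addition would give -56; saturating addition gives i8::MAX.
    let r = _mm_adds_epi8(a, b);
    let mut out = [0i8; 16];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
    assert!(out.iter().all(|&x| x == 127));
}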

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8()))
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16()))
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8()))
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
    let a = simd_cast::<_, u16x16>(a.as_u8x16());
    let b = simd_cast::<_, u16x16>(b.as_u8x16());
    let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
    transmute(simd_cast::<_, u8x16>(r))
}
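
// Usage sketch (illustrative; not part of the original file): `pavgb`
// computes the rounded average `(a + b + 1) >> 1` without overflowing,
// matching the widening arithmetic in the implementation above. The helper
// name is an assumption made for this example.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn avg_epu8_sketch() {
    let a = _mm_set1_epi8(254u8 as i8);
    let b = _mm_set1_epi8(255u8 as i8);
    // (254 + 255 + 1) >> 1 = 255; the intermediate 510 never wraps.
    let r = _mm_avg_epu8(a, b);
    let mut out = [0u8; 16];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
    assert!(out.iter().all(|&x| x == 255));
}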

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a = simd_cast::<_, u32x8>(a.as_u16x8());
    let b = simd_cast::<_, u32x8>(b.as_u16x8());
    let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
    transmute(simd_cast::<_, u16x8>(r))
}

/// Multiplies and then horizontally adds signed 16-bit integers in `a` and `b`.
///
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs of
/// intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaddwd(a.as_i16x8(), b.as_i16x8()))
}
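
// Usage sketch (illustrative; not part of the original file): each output
// 32-bit lane is `a[2k] * b[2k] + a[2k+1] * b[2k+1]`, i.e. a dot product of
// adjacent 16-bit pairs. The helper name is an assumption.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn madd_epi16_sketch() {
    let a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
    let b = _mm_set1_epi16(10);
    let r = _mm_madd_epi16(a, b);
    // Lowest 32-bit lane: 1 * 10 + 2 * 10 = 30.
    assert_eq!(_mm_cvtsi128_si32(r), 30);
}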

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
    transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let b = b.as_u8x16();
    transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
    transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let b = b.as_u8x16();
    transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    let a = simd_cast::<_, i32x8>(a.as_i16x8());
    let b = simd_cast::<_, i32x8>(b.as_i16x8());
    let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
    transmute(simd_cast::<i32x8, i16x8>(r))
}

/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a = simd_cast::<_, u32x8>(a.as_u16x8());
    let b = simd_cast::<_, u32x8>(b.as_u16x8());
    let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
    transmute(simd_cast::<u32x8, u16x8>(r))
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i16x8(), b.as_i16x8()))
}

/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u64x2();
    let b = b.as_u64x2();
    let mask = u64x2::splat(u32::MAX.into());
    transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
}
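
// Usage sketch (illustrative; not part of the original file): only the low
// 32 bits of each 64-bit lane participate, and the full 64-bit product is
// returned, so no precision is lost. The helper name is an assumption.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn mul_epu32_sketch() {
    let a = _mm_set_epi64x(0, u32::MAX as i64);
    let b = _mm_set_epi64x(0, u32::MAX as i64);
    let r = _mm_mul_epu32(a, b);
    let mut out = [0u64; 2];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
    // 0xFFFF_FFFF squared is 0xFFFF_FFFE_0000_0001; it fits the 64-bit lane.
    assert_eq!(out[0], 0xFFFF_FFFE_0000_0001);
}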

/// Sums the absolute differences of packed unsigned 8-bit integers.
///
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each consecutive group of 8 differences to
/// produce two unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers in the low 16 bits of the returned 64-bit elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
    transmute(psadbw(a.as_u8x16(), b.as_u8x16()))
}
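
// Usage sketch (illustrative; not part of the original file): `psadbw` is a
// common building block for motion estimation and byte-wise distances. Each
// 64-bit lane receives the sum of 8 absolute byte differences. The helper
// name is an assumption made for this example.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn sad_epu8_sketch() {
    let a = _mm_set1_epi8(10);
    let b = _mm_set1_epi8(3);
    let r = _mm_sad_epu8(a, b);
    // Each half sums eight |10 - 3| = 7 differences: 8 * 7 = 56.
    assert_eq!(_mm_cvtsi128_si32(r), 56);
}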

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_sub(a.as_i8x16(), b.as_i8x16()))
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_sub(a.as_i16x8(), b.as_i16x8()))
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_sub(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_sub(a.as_i64x2(), b.as_i64x2()))
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16()))
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8()))
}

/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16()))
}

/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8()))
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    _mm_slli_si128_impl::<IMM8>(a)
}
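
// Usage sketch (illustrative; not part of the original file): `pslldq`
// shifts whole bytes across the vector, unlike the per-element bit shifts
// below. The helper name is an assumption made for this example.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn slli_si128_sketch() {
    let a = _mm_set1_epi32(0x0403_0201);
    // Shift left by one byte: the low lane's bytes become 00 01 02 03.
    let r = _mm_slli_si128::<1>(a);
    assert_eq!(_mm_cvtsi128_si32(r), 0x0302_0100);
}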

/// Implementation detail: converts the immediate argument of the
/// `_mm_slli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 {
            i
        } else {
            16 - shift + i
        }
    }
    transmute::<i8x16, _>(simd_shuffle!(
        i8x16::ZERO,
        a.as_i8x16(),
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    ))
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    _mm_slli_si128_impl::<IMM8>(a)
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    _mm_srli_si128_impl::<IMM8>(a)
}

/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    if IMM8 >= 16 {
        _mm_setzero_si128()
    } else {
        transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
    }
}

/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
    transmute(psllw(a.as_i16x8(), count.as_i16x8()))
}

/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    if IMM8 >= 32 {
        _mm_setzero_si128()
    } else {
        transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
    }
}

/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
    transmute(pslld(a.as_i32x4(), count.as_i32x4()))
}

/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    if IMM8 >= 64 {
        _mm_setzero_si128()
    } else {
        transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
    }
}

/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
    transmute(psllq(a.as_i64x2(), count.as_i64x2()))
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16)))
}
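
// Usage sketch (illustrative; not part of the original file): the
// arithmetic shift replicates the sign bit, so negative lanes stay
// negative; counts of 16 or more behave like a shift by 15. The helper
// name is an assumption made for this example.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn srai_epi16_sketch() {
    let a = _mm_set1_epi16(-32);
    // -32 >> 2 = -8 with sign bits shifted in (a logical shift would give 16376).
    let r = _mm_srai_epi16::<2>(a);
    assert_eq!(_mm_extract_epi16::<0>(r) as i16, -8);
}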

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
    transmute(psraw(a.as_i16x8(), count.as_i16x8()))
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31))))
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
    transmute(psrad(a.as_i32x4(), count.as_i32x4()))
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    _mm_srli_si128_impl::<IMM8>(a)
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        if (shift as u32) > 15 {
            i + 16
        } else {
            i + (shift as u32)
        }
    }
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    );
    transmute(x)
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    if IMM8 >= 16 {
        _mm_setzero_si128()
    } else {
        transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
    }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    transmute(psrlw(a.as_i16x8(), count.as_i16x8()))
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    if IMM8 >= 32 {
        _mm_setzero_si128()
    } else {
        transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
    }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    transmute(psrld(a.as_i32x4(), count.as_i32x4()))
}

/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    if IMM8 >= 64 {
        _mm_setzero_si128()
    } else {
        transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
    }
}

/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    transmute(psrlq(a.as_i64x2(), count.as_i64x2()))
}

/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    simd_and(a, b)
}

/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    simd_and(simd_xor(_mm_set1_epi8(-1), a), b)
}
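
// Usage sketch (illustrative; not part of the original file): `andnot`
// computes `(!a) & b`, which makes `a` act as a clear-mask over `b`. The
// helper name is an assumption made for this example.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn andnot_si128_sketch() {
    let mask = _mm_set1_epi32(0x0000_00FF);
    let data = _mm_set1_epi32(0x1234_5678);
    // Clears the byte selected by `mask` in every 32-bit lane.
    let r = _mm_andnot_si128(mask, data);
    assert_eq!(_mm_cvtsi128_si32(r), 0x1234_5600);
}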

/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    simd_or(a, b)
}

/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    simd_xor(a, b)
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8()))
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8()))
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 8-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed 16-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8()))
}

/// Compares packed 32-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4()))
}

/// Converts the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    let a = a.as_i32x4();
    simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
}

/// Returns `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    simd_insert!(a, 0, b as f64)
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    transmute(simd_cast::<_, f32x4>(a.as_i32x4()))
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    transmute(cvtps2dq(a))
}

/// Returns a vector whose lowest element is `a` and all higher elements are
/// `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i {
    transmute(i32x4::new(a, 0, 0, 0))
}

/// Returns the lowest element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    simd_extract!(a.as_i32x4(), 0)
}

/// Sets packed 64-bit integers with the supplied values, from highest to
/// lowest.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
    transmute(i64x2::new(e0, e1))
}

/// Sets packed 32-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    transmute(i32x4::new(e0, e1, e2, e3))
}

/// Sets packed 16-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
}

/// Sets packed 8-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    #[rustfmt::skip]
    transmute(i8x16::new(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ))
}

/// Broadcasts 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i {
    _mm_set_epi64x(a, a)
}

/// Broadcasts 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i {
    _mm_set_epi32(a, a, a, a)
}

/// Broadcasts 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i {
    _mm_set_epi16(a, a, a, a, a, a, a, a)
}

/// Broadcasts 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i {
    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
}

/// Sets packed 32-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    _mm_set_epi32(e0, e1, e2, e3)
}
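
// Usage sketch (illustrative; not part of the original file): `set` takes
// arguments from highest to lowest lane, while `setr` takes them in memory
// (reversed) order, so these two calls build the same vector. The helper
// name is an assumption made for this example.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn set_vs_setr_sketch() {
    let a = _mm_set_epi32(4, 3, 2, 1);
    let b = _mm_setr_epi32(1, 2, 3, 4);
    let eq = _mm_cmpeq_epi32(a, b);
    // Every byte compared equal, so the movemask is all ones.
    assert_eq!(_mm_movemask_epi8(eq), 0xFFFF);
}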

/// Sets packed 16-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}

/// Sets packed 8-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    #[rustfmt::skip]
    _mm_set_epi8(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}

/// Returns a vector with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setzero_si128() -> __m128i {
    const { mem::zeroed() }
}

/// Loads a 64-bit integer from memory into the first element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
}

/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    *mem_addr
}

/// Loads 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    let mut dst: __m128i = _mm_undefined_si128();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128i>(),
    );
    dst
}

/// Conditionally stores 8-bit integer elements from `a` into memory using
/// `mask`.
///
/// Elements are not stored when the highest bit is not set in the
/// corresponding element.
///
/// `mem_addr` should correspond to a 128-bit memory location and does not need
/// to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    *mem_addr = a;
}

/// Stores 128-bits of integer data from `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    mem_addr.write_unaligned(a);
}
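
// Usage sketch (illustrative; not part of the original file): the unaligned
// load/store pair round-trips arbitrary byte buffers without any alignment
// requirement. The helper name is an assumption made for this example.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn unaligned_roundtrip_sketch() {
    let src: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    let v = _mm_loadu_si128(src.as_ptr() as *const __m128i);
    let mut dst = [0u8; 16];
    _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, v);
    assert_eq!(src, dst);
}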

/// Stores the lower 64-bit integer `a` to a memory location.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
}

/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
    crate::arch::asm!(
        vps!("movntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}

/// Stores a 32-bit integer value in the specified memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
    crate::arch::asm!(
        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
        p = in(reg) mem_addr,
        a = in(reg) a,
        options(nostack, preserves_flags),
    );
}

/// Returns a vector where the low element is extracted from `a` and its upper
/// element is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movd on msvc, movd on i686
#[cfg_attr(
    all(test, not(target_env = "msvc"), target_arch = "x86_64"),
    assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
    let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
    transmute(r)
}

/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packsswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
    transmute(packsswb(a.as_i16x8(), b.as_i16x8()))
}
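
// Usage sketch (illustrative; not part of the original file): packing
// narrows with signed saturation, so out-of-range 16-bit values clamp to
// the `i8` bounds. The helper name is an assumption made for this example.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn packs_epi16_sketch() {
    let a = _mm_set1_epi16(300); // above i8::MAX
    let b = _mm_set1_epi16(-300); // below i8::MIN
    let r = _mm_packs_epi16(a, b);
    let mut out = [0i8; 16];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
    // Low half comes from `a` (clamped to 127), high half from `b` (-128).
    assert!(out[..8].iter().all(|&x| x == 127));
    assert!(out[8..].iter().all(|&x| x == -128));
}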
1375
1376/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
1377/// using signed saturation.
1378///
1379/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
1380#[inline]
1381#[target_feature(enable = "sse2")]
1382#[cfg_attr(test, assert_instr(packssdw))]
1383#[stable(feature = "simd_x86", since = "1.27.0")]
1384pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
1385    transmute(packssdw(a.as_i32x4(), b.as_i32x4()))
1386}
1387
1388/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1389/// using unsigned saturation.
1390///
1391/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
1392#[inline]
1393#[target_feature(enable = "sse2")]
1394#[cfg_attr(test, assert_instr(packuswb))]
1395#[stable(feature = "simd_x86", since = "1.27.0")]
1396pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
1397    transmute(packuswb(a.as_i16x8(), b.as_i16x8()))
1398}
1399
1400/// Returns the `IMM8` element of `a`, zero-extended to a 32-bit integer.
1401///
1402/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
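///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical); note
/// that the lane is zero-extended, so a negative lane comes back as its
/// `u16` bit pattern:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let a = _mm_setr_epi16(10, 20, 30, -1, 0, 0, 0, 0);
///     assert_eq!(_mm_extract_epi16::<2>(a), 30);
///     assert_eq!(_mm_extract_epi16::<3>(a), 0xFFFF);
/// }
/// ```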
1403#[inline]
1404#[target_feature(enable = "sse2")]
1405#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
1406#[rustc_legacy_const_generics(1)]
1407#[stable(feature = "simd_x86", since = "1.27.0")]
1408pub unsafe fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
1409    static_assert_uimm_bits!(IMM8, 3);
1410    simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32
1411}
1412
1413/// Returns a new vector where the `IMM8` element of `a` is replaced with `i`.
1414///
1415/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
1416#[inline]
1417#[target_feature(enable = "sse2")]
1418#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
1419#[rustc_legacy_const_generics(2)]
1420#[stable(feature = "simd_x86", since = "1.27.0")]
1421pub unsafe fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
1422    static_assert_uimm_bits!(IMM8, 3);
1423    transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16))
1424}
1425
1426/// Returns a mask of the most significant bit of each element in `a`.
1427///
1428/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
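///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical): bit
/// `i` of the result is the sign bit of byte `i`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let a = _mm_setr_epi8(-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
///     // Bytes 0 and 2 have their sign bits set.
///     assert_eq!(_mm_movemask_epi8(a), 0b0000_0101);
/// }
/// ```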
1429#[inline]
1430#[target_feature(enable = "sse2")]
1431#[cfg_attr(test, assert_instr(pmovmskb))]
1432#[stable(feature = "simd_x86", since = "1.27.0")]
1433pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
1434    let z = i8x16::ZERO;
1435    let m: i8x16 = simd_lt(a.as_i8x16(), z);
1436    simd_bitmask::<_, u16>(m) as u32 as i32
1437}
1438
1439/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
1440///
1441/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
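///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical):
/// each 2-bit field of `IMM8` selects the source lane for one output lane,
/// lowest lane first:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let a = _mm_setr_epi32(7, 1, 2, 3);
///     // 0b00_01_10_11 selects lanes 3, 2, 1, 0: it reverses the vector.
///     let r = _mm_shuffle_epi32::<0b00_01_10_11>(a);
///     let mut out = [0i32; 4];
///     _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///     assert_eq!(out, [3, 2, 1, 7]);
/// }
/// ```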
1442#[inline]
1443#[target_feature(enable = "sse2")]
1444#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
1445#[rustc_legacy_const_generics(1)]
1446#[stable(feature = "simd_x86", since = "1.27.0")]
1447pub unsafe fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
1448    static_assert_uimm_bits!(IMM8, 8);
1449    let a = a.as_i32x4();
1450    let x: i32x4 = simd_shuffle!(
1451        a,
1452        a,
1453        [
1454            IMM8 as u32 & 0b11,
1455            (IMM8 as u32 >> 2) & 0b11,
1456            (IMM8 as u32 >> 4) & 0b11,
1457            (IMM8 as u32 >> 6) & 0b11,
1458        ],
1459    );
1460    transmute(x)
1461}
1462
1463/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
1464/// `IMM8`.
1465///
1466/// Puts the results in the high 64 bits of the returned vector, with the low 64
1467/// bits copied from `a`.
1468///
1469/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
1470#[inline]
1471#[target_feature(enable = "sse2")]
1472#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
1473#[rustc_legacy_const_generics(1)]
1474#[stable(feature = "simd_x86", since = "1.27.0")]
1475pub unsafe fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1476    static_assert_uimm_bits!(IMM8, 8);
1477    let a = a.as_i16x8();
1478    let x: i16x8 = simd_shuffle!(
1479        a,
1480        a,
1481        [
1482            0,
1483            1,
1484            2,
1485            3,
1486            (IMM8 as u32 & 0b11) + 4,
1487            ((IMM8 as u32 >> 2) & 0b11) + 4,
1488            ((IMM8 as u32 >> 4) & 0b11) + 4,
1489            ((IMM8 as u32 >> 6) & 0b11) + 4,
1490        ],
1491    );
1492    transmute(x)
1493}
1494
1495/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
1496/// `IMM8`.
1497///
1498/// Puts the results in the low 64 bits of the returned vector, with the high 64
1499/// bits copied from `a`.
1500///
1501/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
1502#[inline]
1503#[target_feature(enable = "sse2")]
1504#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
1505#[rustc_legacy_const_generics(1)]
1506#[stable(feature = "simd_x86", since = "1.27.0")]
1507pub unsafe fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1508    static_assert_uimm_bits!(IMM8, 8);
1509    let a = a.as_i16x8();
1510    let x: i16x8 = simd_shuffle!(
1511        a,
1512        a,
1513        [
1514            IMM8 as u32 & 0b11,
1515            (IMM8 as u32 >> 2) & 0b11,
1516            (IMM8 as u32 >> 4) & 0b11,
1517            (IMM8 as u32 >> 6) & 0b11,
1518            4,
1519            5,
1520            6,
1521            7,
1522        ],
1523    );
1524    transmute(x)
1525}
1526
1527/// Unpacks and interleaves 8-bit integers from the high halves of `a` and `b`.
1528///
1529/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
1530#[inline]
1531#[target_feature(enable = "sse2")]
1532#[cfg_attr(test, assert_instr(punpckhbw))]
1533#[stable(feature = "simd_x86", since = "1.27.0")]
1534pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
1535    transmute::<i8x16, _>(simd_shuffle!(
1536        a.as_i8x16(),
1537        b.as_i8x16(),
1538        [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
1539    ))
1540}
1541
1542/// Unpacks and interleaves 16-bit integers from the high halves of `a` and `b`.
1543///
1544/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
1545#[inline]
1546#[target_feature(enable = "sse2")]
1547#[cfg_attr(test, assert_instr(punpckhwd))]
1548#[stable(feature = "simd_x86", since = "1.27.0")]
1549pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
1550    let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
1551    transmute::<i16x8, _>(x)
1552}
1553
1554/// Unpacks and interleaves 32-bit integers from the high halves of `a` and `b`.
1555///
1556/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
1557#[inline]
1558#[target_feature(enable = "sse2")]
1559#[cfg_attr(test, assert_instr(unpckhps))]
1560#[stable(feature = "simd_x86", since = "1.27.0")]
1561pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
1562    transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
1563}
1564
1565/// Unpacks and interleaves 64-bit integers from the high halves of `a` and `b`.
1566///
1567/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
1568#[inline]
1569#[target_feature(enable = "sse2")]
1570#[cfg_attr(test, assert_instr(unpckhpd))]
1571#[stable(feature = "simd_x86", since = "1.27.0")]
1572pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
1573    transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3]))
1574}
1575
1576/// Unpacks and interleaves 8-bit integers from the low halves of `a` and `b`.
1577///
1578/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
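///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical): a
/// common use is widening bytes to words by interleaving with zeros:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let bytes = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
///     // Interleaving with zero bytes zero-extends the low eight lanes to u16.
///     let words = _mm_unpacklo_epi8(bytes, _mm_setzero_si128());
///     let mut out = [0u16; 8];
///     _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, words);
///     assert_eq!(out, [1, 2, 3, 4, 5, 6, 7, 8]);
/// }
/// ```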
1579#[inline]
1580#[target_feature(enable = "sse2")]
1581#[cfg_attr(test, assert_instr(punpcklbw))]
1582#[stable(feature = "simd_x86", since = "1.27.0")]
1583pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
1584    transmute::<i8x16, _>(simd_shuffle!(
1585        a.as_i8x16(),
1586        b.as_i8x16(),
1587        [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
1588    ))
1589}
1590
1591/// Unpacks and interleaves 16-bit integers from the low halves of `a` and `b`.
1592///
1593/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
1594#[inline]
1595#[target_feature(enable = "sse2")]
1596#[cfg_attr(test, assert_instr(punpcklwd))]
1597#[stable(feature = "simd_x86", since = "1.27.0")]
1598pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
1599    let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
1600    transmute::<i16x8, _>(x)
1601}
1602
1603/// Unpacks and interleaves 32-bit integers from the low halves of `a` and `b`.
1604///
1605/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
1606#[inline]
1607#[target_feature(enable = "sse2")]
1608#[cfg_attr(test, assert_instr(unpcklps))]
1609#[stable(feature = "simd_x86", since = "1.27.0")]
1610pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
1611    transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
1612}
1613
1614/// Unpacks and interleaves 64-bit integers from the low halves of `a` and `b`.
1615///
1616/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
1617#[inline]
1618#[target_feature(enable = "sse2")]
1619#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
1620#[stable(feature = "simd_x86", since = "1.27.0")]
1621pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
1622    transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2]))
1623}
1624
1625/// Returns a new vector with the low element of `a` replaced by the sum of the
1626/// low elements of `a` and `b`.
1627///
1628/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
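///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical): only
/// the low lane is added, while the high lane is copied from `a`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let a = _mm_setr_pd(1.0, 10.0);
///     let b = _mm_setr_pd(2.0, 20.0);
///     let r = _mm_add_sd(a, b);
///     let mut out = [0.0f64; 2];
///     _mm_storeu_pd(out.as_mut_ptr(), r);
///     assert_eq!(out, [3.0, 10.0]);
/// }
/// ```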
1629#[inline]
1630#[target_feature(enable = "sse2")]
1631#[cfg_attr(test, assert_instr(addsd))]
1632#[stable(feature = "simd_x86", since = "1.27.0")]
1633pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
1634    simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))
1635}
1636
1637/// Adds packed double-precision (64-bit) floating-point elements in `a` and
1638/// `b`.
1639///
1640/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
1641#[inline]
1642#[target_feature(enable = "sse2")]
1643#[cfg_attr(test, assert_instr(addpd))]
1644#[stable(feature = "simd_x86", since = "1.27.0")]
1645pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
1646    simd_add(a, b)
1647}
1648
1649/// Returns a new vector with the low element of `a` replaced by the result of
1650/// dividing the lower element of `a` by the lower element of `b`.
1651///
1652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
1653#[inline]
1654#[target_feature(enable = "sse2")]
1655#[cfg_attr(test, assert_instr(divsd))]
1656#[stable(feature = "simd_x86", since = "1.27.0")]
1657pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
1658    simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))
1659}
1660
1661/// Divides packed double-precision (64-bit) floating-point elements in `a` by
1662/// packed elements in `b`.
1663///
1664/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
1665#[inline]
1666#[target_feature(enable = "sse2")]
1667#[cfg_attr(test, assert_instr(divpd))]
1668#[stable(feature = "simd_x86", since = "1.27.0")]
1669pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
1670    simd_div(a, b)
1671}
1672
1673/// Returns a new vector with the low element of `a` replaced by the maximum
1674/// of the lower elements of `a` and `b`.
1675///
1676/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
1677#[inline]
1678#[target_feature(enable = "sse2")]
1679#[cfg_attr(test, assert_instr(maxsd))]
1680#[stable(feature = "simd_x86", since = "1.27.0")]
1681pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
1682    maxsd(a, b)
1683}
1684
1685/// Returns a new vector with the maximum values from corresponding elements in
1686/// `a` and `b`.
1687///
1688/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
1689#[inline]
1690#[target_feature(enable = "sse2")]
1691#[cfg_attr(test, assert_instr(maxpd))]
1692#[stable(feature = "simd_x86", since = "1.27.0")]
1693pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
1694    maxpd(a, b)
1695}
1696
1697/// Returns a new vector with the low element of `a` replaced by the minimum
1698/// of the lower elements of `a` and `b`.
1699///
1700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
1701#[inline]
1702#[target_feature(enable = "sse2")]
1703#[cfg_attr(test, assert_instr(minsd))]
1704#[stable(feature = "simd_x86", since = "1.27.0")]
1705pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
1706    minsd(a, b)
1707}
1708
1709/// Returns a new vector with the minimum values from corresponding elements in
1710/// `a` and `b`.
1711///
1712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
1713#[inline]
1714#[target_feature(enable = "sse2")]
1715#[cfg_attr(test, assert_instr(minpd))]
1716#[stable(feature = "simd_x86", since = "1.27.0")]
1717pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
1718    minpd(a, b)
1719}
1720
1721/// Returns a new vector with the low element of `a` replaced by the product of
1722/// the low elements of `a` and `b`.
1723///
1724/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
1725#[inline]
1726#[target_feature(enable = "sse2")]
1727#[cfg_attr(test, assert_instr(mulsd))]
1728#[stable(feature = "simd_x86", since = "1.27.0")]
1729pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
1730    simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))
1731}
1732
1733/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
1734/// and `b`.
1735///
1736/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
1737#[inline]
1738#[target_feature(enable = "sse2")]
1739#[cfg_attr(test, assert_instr(mulpd))]
1740#[stable(feature = "simd_x86", since = "1.27.0")]
1741pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
1742    simd_mul(a, b)
1743}
1744
1745/// Returns a new vector with the low element of `a` replaced by the square
1746/// root of the lower element of `b`.
1747///
1748/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
1749#[inline]
1750#[target_feature(enable = "sse2")]
1751#[cfg_attr(test, assert_instr(sqrtsd))]
1752#[stable(feature = "simd_x86", since = "1.27.0")]
1753pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
1754    simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b)))
1755}
1756
1757/// Returns a new vector with the square root of each of the values in `a`.
1758///
1759/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
1760#[inline]
1761#[target_feature(enable = "sse2")]
1762#[cfg_attr(test, assert_instr(sqrtpd))]
1763#[stable(feature = "simd_x86", since = "1.27.0")]
1764pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d {
1765    simd_fsqrt(a)
1766}
1767
1768/// Returns a new vector with the low element of `a` replaced by subtracting the
1769/// low element of `b` from the low element of `a`.
1770///
1771/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
1772#[inline]
1773#[target_feature(enable = "sse2")]
1774#[cfg_attr(test, assert_instr(subsd))]
1775#[stable(feature = "simd_x86", since = "1.27.0")]
1776pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
1777    simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))
1778}
1779
1780/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
1781/// from `a`.
1782///
1783/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
1784#[inline]
1785#[target_feature(enable = "sse2")]
1786#[cfg_attr(test, assert_instr(subpd))]
1787#[stable(feature = "simd_x86", since = "1.27.0")]
1788pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
1789    simd_sub(a, b)
1790}
1791
1792/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
1793/// elements in `a` and `b`.
1794///
1795/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
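///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical):
/// masking off the sign bit is a common way to compute absolute values with
/// `_mm_and_pd`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     // All bits set except the two sign bits.
///     let abs_mask = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF_FFFF_FFFF_FFFF));
///     let r = _mm_and_pd(_mm_setr_pd(-1.5, 2.5), abs_mask);
///     let mut out = [0.0f64; 2];
///     _mm_storeu_pd(out.as_mut_ptr(), r);
///     assert_eq!(out, [1.5, 2.5]);
/// }
/// ```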
1796#[inline]
1797#[target_feature(enable = "sse2")]
1798#[cfg_attr(test, assert_instr(andps))]
1799#[stable(feature = "simd_x86", since = "1.27.0")]
1800pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
1801    let a: __m128i = transmute(a);
1802    let b: __m128i = transmute(b);
1803    transmute(_mm_and_si128(a, b))
1804}
1805
1806/// Computes the bitwise NOT of `a` and then AND with `b`.
1807///
1808/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
1809#[inline]
1810#[target_feature(enable = "sse2")]
1811#[cfg_attr(test, assert_instr(andnps))]
1812#[stable(feature = "simd_x86", since = "1.27.0")]
1813pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
1814    let a: __m128i = transmute(a);
1815    let b: __m128i = transmute(b);
1816    transmute(_mm_andnot_si128(a, b))
1817}
1818
1819/// Computes the bitwise OR of `a` and `b`.
1820///
1821/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
1822#[inline]
1823#[target_feature(enable = "sse2")]
1824#[cfg_attr(test, assert_instr(orps))]
1825#[stable(feature = "simd_x86", since = "1.27.0")]
1826pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
1827    let a: __m128i = transmute(a);
1828    let b: __m128i = transmute(b);
1829    transmute(_mm_or_si128(a, b))
1830}
1831
1832/// Computes the bitwise XOR of `a` and `b`.
1833///
1834/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
1835#[inline]
1836#[target_feature(enable = "sse2")]
1837#[cfg_attr(test, assert_instr(xorps))]
1838#[stable(feature = "simd_x86", since = "1.27.0")]
1839pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
1840    let a: __m128i = transmute(a);
1841    let b: __m128i = transmute(b);
1842    transmute(_mm_xor_si128(a, b))
1843}
1844
1845/// Returns a new vector with the low element of `a` replaced by the equality
1846/// comparison of the lower elements of `a` and `b`.
1847///
1848/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
1849#[inline]
1850#[target_feature(enable = "sse2")]
1851#[cfg_attr(test, assert_instr(cmpeqsd))]
1852#[stable(feature = "simd_x86", since = "1.27.0")]
1853pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
1854    cmpsd(a, b, 0)
1855}
1856
1857/// Returns a new vector with the low element of `a` replaced by the less-than
1858/// comparison of the lower elements of `a` and `b`.
1859///
1860/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
1861#[inline]
1862#[target_feature(enable = "sse2")]
1863#[cfg_attr(test, assert_instr(cmpltsd))]
1864#[stable(feature = "simd_x86", since = "1.27.0")]
1865pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
1866    cmpsd(a, b, 1)
1867}
1868
1869/// Returns a new vector with the low element of `a` replaced by the
1870/// less-than-or-equal comparison of the lower elements of `a` and `b`.
1871///
1872/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
1873#[inline]
1874#[target_feature(enable = "sse2")]
1875#[cfg_attr(test, assert_instr(cmplesd))]
1876#[stable(feature = "simd_x86", since = "1.27.0")]
1877pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
1878    cmpsd(a, b, 2)
1879}
1880
1881/// Returns a new vector with the low element of `a` replaced by the
1882/// greater-than comparison of the lower elements of `a` and `b`.
1883///
1884/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
1885#[inline]
1886#[target_feature(enable = "sse2")]
1887#[cfg_attr(test, assert_instr(cmpltsd))]
1888#[stable(feature = "simd_x86", since = "1.27.0")]
1889pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
1890    simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64))
1891}
1892
1893/// Returns a new vector with the low element of `a` replaced by the
1894/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
1895///
1896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
1897#[inline]
1898#[target_feature(enable = "sse2")]
1899#[cfg_attr(test, assert_instr(cmplesd))]
1900#[stable(feature = "simd_x86", since = "1.27.0")]
1901pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
1902    simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64))
1903}
1904
1905/// Returns a new vector with the low element of `a` replaced by the result
1906/// of comparing the lower elements of `a` and `b` for `NaN`. If neither is
1907/// `NaN`, the low element of the result is `0xFFFFFFFFFFFFFFFF`; otherwise
1908/// it is `0`.
1909///
1910/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
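///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let r = _mm_cmpord_sd(_mm_set_sd(f64::NAN), _mm_set_sd(1.0));
///     // One operand is NaN, so the low lane of the result is all-zeros.
///     assert_eq!(_mm_cvtsd_f64(r).to_bits(), 0);
/// }
/// ```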
1911#[inline]
1912#[target_feature(enable = "sse2")]
1913#[cfg_attr(test, assert_instr(cmpordsd))]
1914#[stable(feature = "simd_x86", since = "1.27.0")]
1915pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
1916    cmpsd(a, b, 7)
1917}
1918
1919/// Returns a new vector with the low element of `a` replaced by the result of
1920/// comparing the lower elements of `a` and `b` for `NaN`. If either is `NaN`,
1921/// the low element of the result is `0xFFFFFFFFFFFFFFFF`; otherwise it is `0`.
1922///
1923/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
1924#[inline]
1925#[target_feature(enable = "sse2")]
1926#[cfg_attr(test, assert_instr(cmpunordsd))]
1927#[stable(feature = "simd_x86", since = "1.27.0")]
1928pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
1929    cmpsd(a, b, 3)
1930}
1931
1932/// Returns a new vector with the low element of `a` replaced by the not-equal
1933/// comparison of the lower elements of `a` and `b`.
1934///
1935/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
1936#[inline]
1937#[target_feature(enable = "sse2")]
1938#[cfg_attr(test, assert_instr(cmpneqsd))]
1939#[stable(feature = "simd_x86", since = "1.27.0")]
1940pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
1941    cmpsd(a, b, 4)
1942}
1943
1944/// Returns a new vector with the low element of `a` replaced by the
1945/// not-less-than comparison of the lower elements of `a` and `b`.
1946///
1947/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
1948#[inline]
1949#[target_feature(enable = "sse2")]
1950#[cfg_attr(test, assert_instr(cmpnltsd))]
1951#[stable(feature = "simd_x86", since = "1.27.0")]
1952pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
1953    cmpsd(a, b, 5)
1954}
1955
1956/// Returns a new vector with the low element of `a` replaced by the
1957/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
1958///
1959/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
1960#[inline]
1961#[target_feature(enable = "sse2")]
1962#[cfg_attr(test, assert_instr(cmpnlesd))]
1963#[stable(feature = "simd_x86", since = "1.27.0")]
1964pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
1965    cmpsd(a, b, 6)
1966}
1967
1968/// Returns a new vector with the low element of `a` replaced by the
1969/// not-greater-than comparison of the lower elements of `a` and `b`.
1970///
1971/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
1972#[inline]
1973#[target_feature(enable = "sse2")]
1974#[cfg_attr(test, assert_instr(cmpnltsd))]
1975#[stable(feature = "simd_x86", since = "1.27.0")]
1976pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
1977    simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64))
1978}
1979
1980/// Returns a new vector with the low element of `a` replaced by the
1981/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
1982///
1983/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
1984#[inline]
1985#[target_feature(enable = "sse2")]
1986#[cfg_attr(test, assert_instr(cmpnlesd))]
1987#[stable(feature = "simd_x86", since = "1.27.0")]
1988pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
1989    simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64))
1990}
1991
1992/// Compares corresponding elements in `a` and `b` for equality.
1993///
1994/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
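///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical): each
/// lane of the result is all-ones or all-zeros, which `_mm_movemask_pd`
/// condenses to one bit per lane:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let m = _mm_cmpeq_pd(_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
///     // Only lane 0 compares equal.
///     assert_eq!(_mm_movemask_pd(m), 0b01);
/// }
/// ```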
1995#[inline]
1996#[target_feature(enable = "sse2")]
1997#[cfg_attr(test, assert_instr(cmpeqpd))]
1998#[stable(feature = "simd_x86", since = "1.27.0")]
1999pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
2000    cmppd(a, b, 0)
2001}
2002
2003/// Compares corresponding elements in `a` and `b` for less-than.
2004///
2005/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
2006#[inline]
2007#[target_feature(enable = "sse2")]
2008#[cfg_attr(test, assert_instr(cmpltpd))]
2009#[stable(feature = "simd_x86", since = "1.27.0")]
2010pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
2011    cmppd(a, b, 1)
2012}
2013
2014/// Compares corresponding elements in `a` and `b` for less-than-or-equal.
2015///
2016/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
2017#[inline]
2018#[target_feature(enable = "sse2")]
2019#[cfg_attr(test, assert_instr(cmplepd))]
2020#[stable(feature = "simd_x86", since = "1.27.0")]
2021pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
2022    cmppd(a, b, 2)
2023}
2024
2025/// Compares corresponding elements in `a` and `b` for greater-than.
2026///
2027/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
2028#[inline]
2029#[target_feature(enable = "sse2")]
2030#[cfg_attr(test, assert_instr(cmpltpd))]
2031#[stable(feature = "simd_x86", since = "1.27.0")]
2032pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
2033    _mm_cmplt_pd(b, a)
2034}
2035
2036/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
2037///
2038/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
2039#[inline]
2040#[target_feature(enable = "sse2")]
2041#[cfg_attr(test, assert_instr(cmplepd))]
2042#[stable(feature = "simd_x86", since = "1.27.0")]
2043pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
2044    _mm_cmple_pd(b, a)
2045}
2046
2047/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
2048///
2049/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
2050#[inline]
2051#[target_feature(enable = "sse2")]
2052#[cfg_attr(test, assert_instr(cmpordpd))]
2053#[stable(feature = "simd_x86", since = "1.27.0")]
2054pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
2055    cmppd(a, b, 7)
2056}
2057
2058/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
2059///
2060/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
2061#[inline]
2062#[target_feature(enable = "sse2")]
2063#[cfg_attr(test, assert_instr(cmpunordpd))]
2064#[stable(feature = "simd_x86", since = "1.27.0")]
2065pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
2066    cmppd(a, b, 3)
2067}
2068
2069/// Compares corresponding elements in `a` and `b` for not-equal.
2070///
2071/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
2072#[inline]
2073#[target_feature(enable = "sse2")]
2074#[cfg_attr(test, assert_instr(cmpneqpd))]
2075#[stable(feature = "simd_x86", since = "1.27.0")]
2076pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
2077    cmppd(a, b, 4)
2078}
2079
2080/// Compares corresponding elements in `a` and `b` for not-less-than.
2081///
2082/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
2083#[inline]
2084#[target_feature(enable = "sse2")]
2085#[cfg_attr(test, assert_instr(cmpnltpd))]
2086#[stable(feature = "simd_x86", since = "1.27.0")]
2087pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
2088    cmppd(a, b, 5)
2089}
2090
2091/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
2092///
2093/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
2094#[inline]
2095#[target_feature(enable = "sse2")]
2096#[cfg_attr(test, assert_instr(cmpnlepd))]
2097#[stable(feature = "simd_x86", since = "1.27.0")]
2098pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
2099    cmppd(a, b, 6)
2100}
2101
2102/// Compares corresponding elements in `a` and `b` for not-greater-than.
2103///
2104/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
2105#[inline]
2106#[target_feature(enable = "sse2")]
2107#[cfg_attr(test, assert_instr(cmpnltpd))]
2108#[stable(feature = "simd_x86", since = "1.27.0")]
2109pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
2110    _mm_cmpnlt_pd(b, a)
2111}
2112
2113/// Compares corresponding elements in `a` and `b` for
2114/// not-greater-than-or-equal.
2115///
2116/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
2117#[inline]
2118#[target_feature(enable = "sse2")]
2119#[cfg_attr(test, assert_instr(cmpnlepd))]
2120#[stable(feature = "simd_x86", since = "1.27.0")]
2121pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
2122    _mm_cmpnle_pd(b, a)
2123}
2124
2125/// Compares the lower element of `a` and `b` for equality.
2126///
2127/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
2128#[inline]
2129#[target_feature(enable = "sse2")]
2130#[cfg_attr(test, assert_instr(comisd))]
2131#[stable(feature = "simd_x86", since = "1.27.0")]
2132pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
2133    comieqsd(a, b)
2134}
2135
2136/// Compares the lower element of `a` and `b` for less-than.
2137///
2138/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
2139#[inline]
2140#[target_feature(enable = "sse2")]
2141#[cfg_attr(test, assert_instr(comisd))]
2142#[stable(feature = "simd_x86", since = "1.27.0")]
2143pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
2144    comiltsd(a, b)
2145}
2146
2147/// Compares the lower element of `a` and `b` for less-than-or-equal.
2148///
2149/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
2150#[inline]
2151#[target_feature(enable = "sse2")]
2152#[cfg_attr(test, assert_instr(comisd))]
2153#[stable(feature = "simd_x86", since = "1.27.0")]
2154pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
2155    comilesd(a, b)
2156}
2157
2158/// Compares the lower element of `a` and `b` for greater-than.
2159///
2160/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
2161#[inline]
2162#[target_feature(enable = "sse2")]
2163#[cfg_attr(test, assert_instr(comisd))]
2164#[stable(feature = "simd_x86", since = "1.27.0")]
2165pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
2166    comigtsd(a, b)
2167}
2168
2169/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2170///
2171/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
2172#[inline]
2173#[target_feature(enable = "sse2")]
2174#[cfg_attr(test, assert_instr(comisd))]
2175#[stable(feature = "simd_x86", since = "1.27.0")]
2176pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
2177    comigesd(a, b)
2178}
2179
2180/// Compares the lower element of `a` and `b` for not-equal.
2181///
2182/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
2183#[inline]
2184#[target_feature(enable = "sse2")]
2185#[cfg_attr(test, assert_instr(comisd))]
2186#[stable(feature = "simd_x86", since = "1.27.0")]
2187pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
2188    comineqsd(a, b)
2189}
2190
2191/// Compares the lower element of `a` and `b` for equality.
2192///
2193/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
2194#[inline]
2195#[target_feature(enable = "sse2")]
2196#[cfg_attr(test, assert_instr(ucomisd))]
2197#[stable(feature = "simd_x86", since = "1.27.0")]
2198pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
2199    ucomieqsd(a, b)
2200}
2201
2202/// Compares the lower element of `a` and `b` for less-than.
2203///
2204/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
2205#[inline]
2206#[target_feature(enable = "sse2")]
2207#[cfg_attr(test, assert_instr(ucomisd))]
2208#[stable(feature = "simd_x86", since = "1.27.0")]
2209pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
2210    ucomiltsd(a, b)
2211}
2212
2213/// Compares the lower element of `a` and `b` for less-than-or-equal.
2214///
2215/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
2216#[inline]
2217#[target_feature(enable = "sse2")]
2218#[cfg_attr(test, assert_instr(ucomisd))]
2219#[stable(feature = "simd_x86", since = "1.27.0")]
2220pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
2221    ucomilesd(a, b)
2222}
2223
2224/// Compares the lower element of `a` and `b` for greater-than.
2225///
2226/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
2227#[inline]
2228#[target_feature(enable = "sse2")]
2229#[cfg_attr(test, assert_instr(ucomisd))]
2230#[stable(feature = "simd_x86", since = "1.27.0")]
2231pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
2232    ucomigtsd(a, b)
2233}
2234
2235/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2236///
2237/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
2238#[inline]
2239#[target_feature(enable = "sse2")]
2240#[cfg_attr(test, assert_instr(ucomisd))]
2241#[stable(feature = "simd_x86", since = "1.27.0")]
2242pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
2243    ucomigesd(a, b)
2244}
2245
2246/// Compares the lower element of `a` and `b` for not-equal.
2247///
2248/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
2249#[inline]
2250#[target_feature(enable = "sse2")]
2251#[cfg_attr(test, assert_instr(ucomisd))]
2252#[stable(feature = "simd_x86", since = "1.27.0")]
2253pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
2254    ucomineqsd(a, b)
2255}
2256
2257/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2258/// packed single-precision (32-bit) floating-point elements; the upper two elements of the result are zeroed.
2259///
2260/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
2261#[inline]
2262#[target_feature(enable = "sse2")]
2263#[cfg_attr(test, assert_instr(cvtpd2ps))]
2264#[stable(feature = "simd_x86", since = "1.27.0")]
2265pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
2266    let r = simd_cast::<_, f32x2>(a.as_f64x2());
2267    let zero = f32x2::ZERO;
2268    transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
2269}
2270
2271/// Converts the lower two packed single-precision (32-bit) floating-point
2272/// elements in `a` to packed double-precision (64-bit) floating-point
2273/// elements.
2274///
2275/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
2276#[inline]
2277#[target_feature(enable = "sse2")]
2278#[cfg_attr(test, assert_instr(cvtps2pd))]
2279#[stable(feature = "simd_x86", since = "1.27.0")]
2280pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d {
2281    let a = a.as_f32x4();
2282    transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
2283}
2284
2285/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2286/// packed 32-bit integers.
2287///
2288/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
2289#[inline]
2290#[target_feature(enable = "sse2")]
2291#[cfg_attr(test, assert_instr(cvtpd2dq))]
2292#[stable(feature = "simd_x86", since = "1.27.0")]
2293pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
2294    transmute(cvtpd2dq(a))
2295}
2296
2297/// Converts the lower double-precision (64-bit) floating-point element in `a` to
2298/// a 32-bit integer.
2299///
2300/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
2301#[inline]
2302#[target_feature(enable = "sse2")]
2303#[cfg_attr(test, assert_instr(cvtsd2si))]
2304#[stable(feature = "simd_x86", since = "1.27.0")]
2305pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 {
2306    cvtsd2si(a)
2307}
2308
2309/// Converts the lower double-precision (64-bit) floating-point element in `b`
2310/// to a single-precision (32-bit) floating-point element, stores the result in
2311/// the lower element of the return value, and copies the upper three elements
2312/// from `a` to the upper elements of the return value.
2313///
2314/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
2315#[inline]
2316#[target_feature(enable = "sse2")]
2317#[cfg_attr(test, assert_instr(cvtsd2ss))]
2318#[stable(feature = "simd_x86", since = "1.27.0")]
2319pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
2320    cvtsd2ss(a, b)
2321}
2322
2323/// Returns the lower double-precision (64-bit) floating-point element of `a`.
2324///
2325/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
2326#[inline]
2327#[target_feature(enable = "sse2")]
2328#[stable(feature = "simd_x86", since = "1.27.0")]
2329pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 {
2330    simd_extract!(a, 0)
2331}
2332
2333/// Converts the lower single-precision (32-bit) floating-point element in `b`
2334/// to a double-precision (64-bit) floating-point element, stores the result in
2335/// the lower element of the return value, and copies the upper element from `a`
2336/// to the upper element of the return value.
2337///
2338/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
2339#[inline]
2340#[target_feature(enable = "sse2")]
2341#[cfg_attr(test, assert_instr(cvtss2sd))]
2342#[stable(feature = "simd_x86", since = "1.27.0")]
2343pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
2344    cvtss2sd(a, b)
2345}
2346
2347/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2348/// packed 32-bit integers with truncation.
2349///
2350/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
2351#[inline]
2352#[target_feature(enable = "sse2")]
2353#[cfg_attr(test, assert_instr(cvttpd2dq))]
2354#[stable(feature = "simd_x86", since = "1.27.0")]
2355pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
2356    transmute(cvttpd2dq(a))
2357}
2358
2359/// Converts the lower double-precision (64-bit) floating-point element in `a`
2360/// to a 32-bit integer with truncation.
2361///
2362/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
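///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical)
/// contrasting truncation with the rounding conversion:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let a = _mm_set_sd(3.7);
///     assert_eq!(_mm_cvttsd_si32(a), 3); // truncates toward zero
///     assert_eq!(_mm_cvtsd_si32(a), 4);  // rounds (to nearest by default)
/// }
/// ```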
2363#[inline]
2364#[target_feature(enable = "sse2")]
2365#[cfg_attr(test, assert_instr(cvttsd2si))]
2366#[stable(feature = "simd_x86", since = "1.27.0")]
2367pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 {
2368    cvttsd2si(a)
2369}
2370
2371/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2372/// packed 32-bit integers with truncation.
2373///
2374/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
2375#[inline]
2376#[target_feature(enable = "sse2")]
2377#[cfg_attr(test, assert_instr(cvttps2dq))]
2378#[stable(feature = "simd_x86", since = "1.27.0")]
2379pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i {
2380    transmute(cvttps2dq(a))
2381}
2382
2383/// Copies double-precision (64-bit) floating-point element `a` to the lower
2384/// element of the return value, and zeroes the upper element.
2385///
2386/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
2387#[inline]
2388#[target_feature(enable = "sse2")]
2389#[stable(feature = "simd_x86", since = "1.27.0")]
2390pub unsafe fn _mm_set_sd(a: f64) -> __m128d {
2391    _mm_set_pd(0.0, a)
2392}
2393
2394/// Broadcasts double-precision (64-bit) floating-point value `a` to all elements
2395/// of the return value.
2396///
2397/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
2398#[inline]
2399#[target_feature(enable = "sse2")]
2400#[stable(feature = "simd_x86", since = "1.27.0")]
2401pub unsafe fn _mm_set1_pd(a: f64) -> __m128d {
2402    _mm_set_pd(a, a)
2403}
2404
2405/// Broadcasts double-precision (64-bit) floating-point value `a` to all elements
2406/// of the return value.
2407///
2408/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
2409#[inline]
2410#[target_feature(enable = "sse2")]
2411#[stable(feature = "simd_x86", since = "1.27.0")]
2412pub unsafe fn _mm_set_pd1(a: f64) -> __m128d {
2413    _mm_set_pd(a, a)
2414}
2415
2416/// Sets packed double-precision (64-bit) floating-point elements in the return
2417/// value with the supplied values.
2418///
2419/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
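///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical):
/// arguments are given from high lane to low lane, the reverse of
/// `_mm_setr_pd`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let v = _mm_set_pd(2.0, 1.0);
///     // Lane 0 (the low lane) receives the *last* argument.
///     assert_eq!(_mm_cvtsd_f64(v), 1.0);
/// }
/// ```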
2420#[inline]
2421#[target_feature(enable = "sse2")]
2422#[stable(feature = "simd_x86", since = "1.27.0")]
2423pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d {
2424    __m128d([b, a])
2425}
2426
2427/// Sets packed double-precision (64-bit) floating-point elements in the return
2428/// value with the supplied values in reverse order.
2429///
2430/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
2431#[inline]
2432#[target_feature(enable = "sse2")]
2433#[stable(feature = "simd_x86", since = "1.27.0")]
2434pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
2435    _mm_set_pd(b, a)
2436}
2437
2438/// Returns packed double-precision (64-bit) floating-point elements with all
2439/// zeros.
2440///
2441/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
2442#[inline]
2443#[target_feature(enable = "sse2")]
2444#[cfg_attr(test, assert_instr(xorp))]
2445#[stable(feature = "simd_x86", since = "1.27.0")]
2446pub unsafe fn _mm_setzero_pd() -> __m128d {
2447    const { mem::zeroed() }
2448}
2449
2450/// Returns a mask of the most significant bit of each element in `a`.
2451///
2452/// The mask is stored in the 2 least significant bits of the return value.
2453/// All other bits are set to `0`.
2454///
2455/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
2456#[inline]
2457#[target_feature(enable = "sse2")]
2458#[cfg_attr(test, assert_instr(movmskpd))]
2459#[stable(feature = "simd_x86", since = "1.27.0")]
2460pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
2461    // A signed compare against zero propagates each sign bit to the whole
2462    // lane, because simd_bitmask requires lanes that are all-1 or all-0.
2463    let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
2464    simd_bitmask::<i64x2, u8>(mask).into()
2465}
2466
2467/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
2468/// floating-point elements) from memory into the returned vector.
2469/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2470/// exception may be generated.
2471///
2472/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
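///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical) of
/// providing the required 16-byte alignment with `#[repr(align)]`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     #[repr(align(16))]
///     struct Aligned([f64; 2]);
///     let data = Aligned([1.0, 2.0]);
///     // The pointer is 16-byte aligned, so the aligned load is permitted.
///     let v = _mm_load_pd(data.0.as_ptr());
///     assert_eq!(_mm_cvtsd_f64(v), 1.0);
/// }
/// ```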
2473#[inline]
2474#[target_feature(enable = "sse2")]
2475#[cfg_attr(test, assert_instr(movaps))]
2476#[stable(feature = "simd_x86", since = "1.27.0")]
2477#[allow(clippy::cast_ptr_alignment)]
2478pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
2479    *(mem_addr as *const __m128d)
2480}
2481
2482/// Loads a 64-bit double-precision value to the low element of a
2483/// 128-bit vector of `[2 x double]` and clears the upper element.
2484///
2485/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
2486#[inline]
2487#[target_feature(enable = "sse2")]
2488#[cfg_attr(test, assert_instr(movsd))]
2489#[stable(feature = "simd_x86", since = "1.27.0")]
2490pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
2491    _mm_setr_pd(*mem_addr, 0.)
2492}
2493
2494/// Loads a double-precision value into the high-order bits of a 128-bit
2495/// vector of `[2 x double]`. The low-order bits are copied from the low-order
2496/// bits of the first operand.
2497///
2498/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
2499#[inline]
2500#[target_feature(enable = "sse2")]
2501#[cfg_attr(test, assert_instr(movhps))]
2502#[stable(feature = "simd_x86", since = "1.27.0")]
2503pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2504    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
2505}
2506
2507/// Loads a double-precision value into the low-order bits of a 128-bit
2508/// vector of `[2 x double]`. The high-order bits are copied from the
2509/// high-order bits of the first operand.
2510///
2511/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
2512#[inline]
2513#[target_feature(enable = "sse2")]
2514#[cfg_attr(test, assert_instr(movlps))]
2515#[stable(feature = "simd_x86", since = "1.27.0")]
2516pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2517    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
2518}
2519
2520/// Stores a 128-bit floating-point vector of `[2 x double]` to a 128-bit
2521/// aligned memory location.
2522/// To minimize caching, the data is flagged as non-temporal (unlikely to be
2523/// used again soon).
2524///
2525/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
2526///
2527/// # Safety of non-temporal stores
2528///
2529/// After using this intrinsic, but before any other access to the memory that this intrinsic
2530/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2531/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2532/// return.
2533///
2534/// See [`_mm_sfence`] for details.
2535#[inline]
2536#[target_feature(enable = "sse2")]
2537#[cfg_attr(test, assert_instr(movntpd))]
2538#[stable(feature = "simd_x86", since = "1.27.0")]
2539#[allow(clippy::cast_ptr_alignment)]
2540pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
2541    crate::arch::asm!(
2542        vps!("movntpd", ",{a}"),
2543        p = in(reg) mem_addr,
2544        a = in(xmm_reg) a,
2545        options(nostack, preserves_flags),
2546    );
2547}
2548
2549/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2550/// memory location.
2551///
2552/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
2553#[inline]
2554#[target_feature(enable = "sse2")]
2555#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlps))]
2556#[stable(feature = "simd_x86", since = "1.27.0")]
2557pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
2558    *mem_addr = simd_extract!(a, 0)
2559}
2560
2561/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2562/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
2563/// on a 16-byte boundary or a general-protection exception may be generated.
2564///
2565/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
2566#[inline]
2567#[target_feature(enable = "sse2")]
2568#[cfg_attr(test, assert_instr(movaps))]
2569#[stable(feature = "simd_x86", since = "1.27.0")]
2570#[allow(clippy::cast_ptr_alignment)]
2571pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
2572    *(mem_addr as *mut __m128d) = a;
2573}
2574
2575/// Stores 128-bits (composed of 2 packed double-precision (64-bit)
2576/// floating-point elements) from `a` into memory.
2577/// `mem_addr` does not need to be aligned on any particular boundary.
2578///
2579/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
2580#[inline]
2581#[target_feature(enable = "sse2")]
2582#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
2583#[stable(feature = "simd_x86", since = "1.27.0")]
2584pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
2585    mem_addr.cast::<__m128d>().write_unaligned(a);
2586}
2587
2588/// Stores the 16-bit integer from the first element of `a` into memory.
2589///
2590/// `mem_addr` does not need to be aligned on any particular boundary.
2591///
2592/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2593#[inline]
2594#[target_feature(enable = "sse2")]
2595#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2596pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2597    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2598}
2599
2600/// Stores the 32-bit integer from the first element of `a` into memory.
2601///
2602/// `mem_addr` does not need to be aligned on any particular boundary.
2603///
2604/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
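///
/// # Examples
///
/// A minimal, illustrative sketch (the `demo` wrapper is hypothetical) of
/// writing the low 32-bit lane into a byte buffer with no alignment
/// requirement:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn demo() {
///     use std::arch::x86_64::*;
///     let v = _mm_setr_epi32(0x1122_3344, 0, 0, 0);
///     let mut buf = [0u8; 4];
///     _mm_storeu_si32(buf.as_mut_ptr(), v);
///     // x86 is little-endian, so the low byte comes first.
///     assert_eq!(buf, 0x1122_3344i32.to_le_bytes());
/// }
/// ```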
2605#[inline]
2606#[target_feature(enable = "sse2")]
2607#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2608pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2609    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2610}
2611
2612/// Stores the 64-bit integer from the first element of `a` into memory.
2613///
2614/// `mem_addr` does not need to be aligned on any particular boundary.
2615///
2616/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2617#[inline]
2618#[target_feature(enable = "sse2")]
2619#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2620pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2621    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2622}
2623
2624/// Stores the lower double-precision (64-bit) floating-point element from `a`
2625/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2626/// 16-byte boundary or a general-protection exception may be generated.
2627///
2628/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
2629#[inline]
2630#[target_feature(enable = "sse2")]
2631#[stable(feature = "simd_x86", since = "1.27.0")]
2632#[allow(clippy::cast_ptr_alignment)]
2633pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2634    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2635    *(mem_addr as *mut __m128d) = b;
2636}
2637
2638/// Stores the lower double-precision (64-bit) floating-point element from `a`
2639/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2640/// 16-byte boundary or a general-protection exception may be generated.
2641///
2642/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2643#[inline]
2644#[target_feature(enable = "sse2")]
2645#[stable(feature = "simd_x86", since = "1.27.0")]
2646#[allow(clippy::cast_ptr_alignment)]
2647pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2648    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2649    *(mem_addr as *mut __m128d) = b;
2650}
2651
2652/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2653/// memory in reverse order.
2654/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2655/// exception may be generated.
2656///
2657/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
2658#[inline]
2659#[target_feature(enable = "sse2")]
2660#[stable(feature = "simd_x86", since = "1.27.0")]
2661#[allow(clippy::cast_ptr_alignment)]
2662pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2663    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2664    *(mem_addr as *mut __m128d) = b;
2665}
2666
2667/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
2668/// memory location.
2669///
2670/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
2671#[inline]
2672#[target_feature(enable = "sse2")]
2673#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movhps))]
2674#[stable(feature = "simd_x86", since = "1.27.0")]
2675pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
2676    *mem_addr = simd_extract!(a, 1);
2677}
2678
2679/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2680/// memory location.
2681///
2682/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
2683#[inline]
2684#[target_feature(enable = "sse2")]
2685#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlps))]
2686#[stable(feature = "simd_x86", since = "1.27.0")]
2687pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
2688    *mem_addr = simd_extract!(a, 0);
2689}
2690
2691/// Loads a double-precision (64-bit) floating-point element from memory
2692/// into both elements of the returned vector.
2693///
2694/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
2695#[inline]
2696#[target_feature(enable = "sse2")]
2697// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2698#[stable(feature = "simd_x86", since = "1.27.0")]
2699pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2700    let d = *mem_addr;
2701    _mm_setr_pd(d, d)
2702}
2703
2704/// Loads a double-precision (64-bit) floating-point element from memory
2705/// into both elements of the returned vector.
2706///
2707/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2708#[inline]
2709#[target_feature(enable = "sse2")]
2710// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2711#[stable(feature = "simd_x86", since = "1.27.0")]
2712pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2713    _mm_load1_pd(mem_addr)
2714}
2715
2716/// Loads 2 double-precision (64-bit) floating-point elements from memory into
2717/// the returned vector in reverse order. `mem_addr` must be aligned on a
2718/// 16-byte boundary or a general-protection exception may be generated.
2719///
2720/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
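///
/// A usage sketch (not from the upstream docs; `ignore`d because it assumes
/// an SSE2 target and a hypothetical `Aligned` wrapper for 16-byte storage):
///
/// ```ignore
/// #[repr(align(16))]
/// struct Aligned([f64; 2]);
///
/// let mem = Aligned([1.0, 2.0]);
/// let v = unsafe { _mm_loadr_pd(mem.0.as_ptr()) };
/// // `v` holds `[2.0, 1.0]`: the two elements in reversed order.
/// ```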
2721#[inline]
2722#[target_feature(enable = "sse2")]
2723#[cfg_attr(test, assert_instr(movaps))]
2724#[stable(feature = "simd_x86", since = "1.27.0")]
2725pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
2726    let a = _mm_load_pd(mem_addr);
2727    simd_shuffle!(a, a, [1, 0])
2728}
2729
2730/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
2731/// floating-point elements) from memory into the returned vector.
2732/// `mem_addr` does not need to be aligned on any particular boundary.
2733///
2734/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
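///
/// A usage sketch (not from the upstream docs; `ignore`d because it assumes
/// an SSE2 target):
///
/// ```ignore
/// let mem = [1.0_f64, 2.0];
/// // No alignment requirement on `mem`.
/// let v = unsafe { _mm_loadu_pd(mem.as_ptr()) };
/// ```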
2735#[inline]
2736#[target_feature(enable = "sse2")]
2737#[cfg_attr(test, assert_instr(movups))]
2738#[stable(feature = "simd_x86", since = "1.27.0")]
2739pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
2740    let mut dst = _mm_undefined_pd();
2741    ptr::copy_nonoverlapping(
2742        mem_addr as *const u8,
2743        ptr::addr_of_mut!(dst) as *mut u8,
2744        mem::size_of::<__m128d>(),
2745    );
2746    dst
2747}
2748
2749/// Loads unaligned 16 bits of integer data from memory into a new vector.
2750///
2751/// `mem_addr` does not need to be aligned on any particular boundary.
2752///
2753/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
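///
/// A usage sketch (not from the upstream docs; `ignore`d because it assumes
/// an SSE2 target). `_mm_loadu_si32` and `_mm_loadu_si64` behave analogously
/// for wider loads:
///
/// ```ignore
/// let bytes = [0x34_u8, 0x12];
/// let v = unsafe { _mm_loadu_si16(bytes.as_ptr()) };
/// // The low 16-bit lane holds 0x1234 (x86 is little-endian);
/// // all remaining lanes are zero.
/// ```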
2754#[inline]
2755#[target_feature(enable = "sse2")]
2756#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2757pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
2758    transmute(i16x8::new(
2759        ptr::read_unaligned(mem_addr as *const i16),
2760        0,
2761        0,
2762        0,
2763        0,
2764        0,
2765        0,
2766        0,
2767    ))
2768}
2769
2770/// Loads unaligned 32 bits of integer data from memory into a new vector.
2771///
2772/// `mem_addr` does not need to be aligned on any particular boundary.
2773///
2774/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
2775#[inline]
2776#[target_feature(enable = "sse2")]
2777#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2778pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
2779    transmute(i32x4::new(
2780        ptr::read_unaligned(mem_addr as *const i32),
2781        0,
2782        0,
2783        0,
2784    ))
2785}
2786
2787/// Loads unaligned 64 bits of integer data from memory into a new vector.
2788///
2789/// `mem_addr` does not need to be aligned on any particular boundary.
2790///
2791/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
2792#[inline]
2793#[target_feature(enable = "sse2")]
2794#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
2795pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
2796    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
2797}
2798
2799/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
2800/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
2801/// parameter as a specifier.
2802///
2803/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
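///
/// A usage sketch (not from the upstream docs; `ignore`d because it assumes
/// an SSE2 target). Bit 0 of `MASK` selects the low result lane from `a`,
/// bit 1 the high result lane from `b`:
///
/// ```ignore
/// let a = _mm_setr_pd(1.0, 2.0);
/// let b = _mm_setr_pd(3.0, 4.0);
/// let r = unsafe { _mm_shuffle_pd::<0b01>(a, b) };
/// // `r` holds `[2.0, 3.0]`: `a`'s high element, then `b`'s low element.
/// ```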
2804#[inline]
2805#[target_feature(enable = "sse2")]
2806#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
2807#[rustc_legacy_const_generics(2)]
2808#[stable(feature = "simd_x86", since = "1.27.0")]
2809pub unsafe fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
2810    static_assert_uimm_bits!(MASK, 8);
2811    simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2])
2812}
2813
2814/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
2815/// 64 bits are set to the lower 64 bits of the second parameter. The upper
2816/// 64 bits are set to the upper 64 bits of the first parameter.
2817///
2818/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
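///
/// A usage sketch (not from the upstream docs; `ignore`d because it assumes
/// an SSE2 target):
///
/// ```ignore
/// let a = _mm_setr_pd(1.0, 2.0);
/// let b = _mm_setr_pd(3.0, 4.0);
/// let r = unsafe { _mm_move_sd(a, b) };
/// // `r` holds `[3.0, 2.0]`: the low lane from `b`, the high lane from `a`.
/// ```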
2819#[inline]
2820#[target_feature(enable = "sse2")]
2821#[cfg_attr(test, assert_instr(movsd))]
2822#[stable(feature = "simd_x86", since = "1.27.0")]
2823pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
2824    _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1))
2825}
2826
2827/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2828/// floating-point vector of `[4 x float]`.
2829///
2830/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
2831#[inline]
2832#[target_feature(enable = "sse2")]
2833#[stable(feature = "simd_x86", since = "1.27.0")]
2834pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 {
2835    transmute(a)
2836}
2837
2838/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2839/// integer vector.
2840///
2841/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
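///
/// This is a bitwise reinterpretation, not a numeric conversion. A usage
/// sketch (not from the upstream docs; `ignore`d because it assumes an SSE2
/// target):
///
/// ```ignore
/// let a = _mm_set1_pd(1.0);
/// let bits = unsafe { _mm_castpd_si128(a) };
/// // Each 64-bit lane holds 0x3FF0_0000_0000_0000,
/// // the IEEE 754 bit pattern of 1.0.
/// ```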
2842#[inline]
2843#[target_feature(enable = "sse2")]
2844#[stable(feature = "simd_x86", since = "1.27.0")]
2845pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i {
2846    transmute(a)
2847}
2848
2849/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2850/// floating-point vector of `[2 x double]`.
2851///
2852/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
2853#[inline]
2854#[target_feature(enable = "sse2")]
2855#[stable(feature = "simd_x86", since = "1.27.0")]
2856pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d {
2857    transmute(a)
2858}
2859
2860/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2861/// integer vector.
2862///
2863/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
2864#[inline]
2865#[target_feature(enable = "sse2")]
2866#[stable(feature = "simd_x86", since = "1.27.0")]
2867pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i {
2868    transmute(a)
2869}
2870
2871/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2872/// of `[2 x double]`.
2873///
2874/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
2875#[inline]
2876#[target_feature(enable = "sse2")]
2877#[stable(feature = "simd_x86", since = "1.27.0")]
2878pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d {
2879    transmute(a)
2880}
2881
2882/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2883/// of `[4 x float]`.
2884///
2885/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
2886#[inline]
2887#[target_feature(enable = "sse2")]
2888#[stable(feature = "simd_x86", since = "1.27.0")]
2889pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 {
2890    transmute(a)
2891}
2892
2893/// Returns a vector of type `__m128d` with indeterminate elements.
2894/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2895/// In practice, this is equivalent to [`mem::zeroed`].
2896///
2897/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
2898#[inline]
2899#[target_feature(enable = "sse2")]
2900#[stable(feature = "simd_x86", since = "1.27.0")]
2901pub unsafe fn _mm_undefined_pd() -> __m128d {
2902    const { mem::zeroed() }
2903}
2904
2905/// Returns a vector of type `__m128i` with indeterminate elements.
2906/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2907/// In practice, this is equivalent to [`mem::zeroed`].
2908///
2909/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
2910#[inline]
2911#[target_feature(enable = "sse2")]
2912#[stable(feature = "simd_x86", since = "1.27.0")]
2913pub unsafe fn _mm_undefined_si128() -> __m128i {
2914    const { mem::zeroed() }
2915}
2916
2917/// The resulting `__m128d` element is composed of the high-order values of
2918/// the two `__m128d` interleaved input elements, i.e.:
2919///
2920/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
2921/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
2922///
2923/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
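///
/// A usage sketch (not from the upstream docs; `ignore`d because it assumes
/// an SSE2 target):
///
/// ```ignore
/// let a = _mm_setr_pd(1.0, 2.0);
/// let b = _mm_setr_pd(3.0, 4.0);
/// let r = unsafe { _mm_unpackhi_pd(a, b) };
/// // `r` holds `[2.0, 4.0]`: the high elements of `a` and `b`.
/// ```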
2924#[inline]
2925#[target_feature(enable = "sse2")]
2926#[cfg_attr(test, assert_instr(unpckhpd))]
2927#[stable(feature = "simd_x86", since = "1.27.0")]
2928pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
2929    simd_shuffle!(a, b, [1, 3])
2930}
2931
2932/// The resulting `__m128d` element is composed of the low-order values of
2933/// the two `__m128d` interleaved input elements, i.e.:
2934///
2935/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
2936/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
2937///
2938/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
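///
/// A usage sketch (not from the upstream docs; `ignore`d because it assumes
/// an SSE2 target):
///
/// ```ignore
/// let a = _mm_setr_pd(1.0, 2.0);
/// let b = _mm_setr_pd(3.0, 4.0);
/// let r = unsafe { _mm_unpacklo_pd(a, b) };
/// // `r` holds `[1.0, 3.0]`: the low elements of `a` and `b`.
/// ```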
2939#[inline]
2940#[target_feature(enable = "sse2")]
2941#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
2942#[stable(feature = "simd_x86", since = "1.27.0")]
2943pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
2944    simd_shuffle!(a, b, [0, 2])
2945}
2946
2947#[allow(improper_ctypes)]
2948extern "C" {
2949    #[link_name = "llvm.x86.sse2.pause"]
2950    fn pause();
2951    #[link_name = "llvm.x86.sse2.clflush"]
2952    fn clflush(p: *const u8);
2953    #[link_name = "llvm.x86.sse2.lfence"]
2954    fn lfence();
2955    #[link_name = "llvm.x86.sse2.mfence"]
2956    fn mfence();
2957    #[link_name = "llvm.x86.sse2.pmadd.wd"]
2958    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
2959    #[link_name = "llvm.x86.sse2.psad.bw"]
2960    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
2961    #[link_name = "llvm.x86.sse2.psll.w"]
2962    fn psllw(a: i16x8, count: i16x8) -> i16x8;
2963    #[link_name = "llvm.x86.sse2.psll.d"]
2964    fn pslld(a: i32x4, count: i32x4) -> i32x4;
2965    #[link_name = "llvm.x86.sse2.psll.q"]
2966    fn psllq(a: i64x2, count: i64x2) -> i64x2;
2967    #[link_name = "llvm.x86.sse2.psra.w"]
2968    fn psraw(a: i16x8, count: i16x8) -> i16x8;
2969    #[link_name = "llvm.x86.sse2.psra.d"]
2970    fn psrad(a: i32x4, count: i32x4) -> i32x4;
2971    #[link_name = "llvm.x86.sse2.psrl.w"]
2972    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
2973    #[link_name = "llvm.x86.sse2.psrl.d"]
2974    fn psrld(a: i32x4, count: i32x4) -> i32x4;
2975    #[link_name = "llvm.x86.sse2.psrl.q"]
2976    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
2977    #[link_name = "llvm.x86.sse2.cvtps2dq"]
2978    fn cvtps2dq(a: __m128) -> i32x4;
2979    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
2980    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
2981    #[link_name = "llvm.x86.sse2.packsswb.128"]
2982    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
2983    #[link_name = "llvm.x86.sse2.packssdw.128"]
2984    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
2985    #[link_name = "llvm.x86.sse2.packuswb.128"]
2986    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
2987    #[link_name = "llvm.x86.sse2.max.sd"]
2988    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
2989    #[link_name = "llvm.x86.sse2.max.pd"]
2990    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
2991    #[link_name = "llvm.x86.sse2.min.sd"]
2992    fn minsd(a: __m128d, b: __m128d) -> __m128d;
2993    #[link_name = "llvm.x86.sse2.min.pd"]
2994    fn minpd(a: __m128d, b: __m128d) -> __m128d;
2995    #[link_name = "llvm.x86.sse2.cmp.sd"]
2996    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
2997    #[link_name = "llvm.x86.sse2.cmp.pd"]
2998    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
2999    #[link_name = "llvm.x86.sse2.comieq.sd"]
3000    fn comieqsd(a: __m128d, b: __m128d) -> i32;
3001    #[link_name = "llvm.x86.sse2.comilt.sd"]
3002    fn comiltsd(a: __m128d, b: __m128d) -> i32;
3003    #[link_name = "llvm.x86.sse2.comile.sd"]
3004    fn comilesd(a: __m128d, b: __m128d) -> i32;
3005    #[link_name = "llvm.x86.sse2.comigt.sd"]
3006    fn comigtsd(a: __m128d, b: __m128d) -> i32;
3007    #[link_name = "llvm.x86.sse2.comige.sd"]
3008    fn comigesd(a: __m128d, b: __m128d) -> i32;
3009    #[link_name = "llvm.x86.sse2.comineq.sd"]
3010    fn comineqsd(a: __m128d, b: __m128d) -> i32;
3011    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
3012    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
3013    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
3014    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
3015    #[link_name = "llvm.x86.sse2.ucomile.sd"]
3016    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
3017    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
3018    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
3019    #[link_name = "llvm.x86.sse2.ucomige.sd"]
3020    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
3021    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
3022    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
3023    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
3024    fn cvtpd2dq(a: __m128d) -> i32x4;
3025    #[link_name = "llvm.x86.sse2.cvtsd2si"]
3026    fn cvtsd2si(a: __m128d) -> i32;
3027    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
3028    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
3029    #[link_name = "llvm.x86.sse2.cvtss2sd"]
3030    fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
3031    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
3032    fn cvttpd2dq(a: __m128d) -> i32x4;
3033    #[link_name = "llvm.x86.sse2.cvttsd2si"]
3034    fn cvttsd2si(a: __m128d) -> i32;
3035    #[link_name = "llvm.x86.sse2.cvttps2dq"]
3036    fn cvttps2dq(a: __m128) -> i32x4;
3037}
3038
3039#[cfg(test)]
3040mod tests {
3041    use crate::{
3042        core_arch::{simd::*, x86::*},
3043        hint::black_box,
3044    };
3045    use std::{
3046        boxed, f32, f64,
3047        mem::{self, transmute},
3048        ptr,
3049    };
3050    use stdarch_test::simd_test;
3051
3052    const NAN: f64 = f64::NAN;
3053
3054    #[test]
3055    fn test_mm_pause() {
3056        unsafe { _mm_pause() }
3057    }
3058
3059    #[simd_test(enable = "sse2")]
3060    unsafe fn test_mm_clflush() {
3061        let x = 0_u8;
3062        _mm_clflush(ptr::addr_of!(x));
3063    }
3064
3065    #[simd_test(enable = "sse2")]
3066    // Miri cannot support this until it is clear how it fits in the Rust memory model
3067    #[cfg_attr(miri, ignore)]
3068    unsafe fn test_mm_lfence() {
3069        _mm_lfence();
3070    }
3071
3072    #[simd_test(enable = "sse2")]
3073    // Miri cannot support this until it is clear how it fits in the Rust memory model
3074    #[cfg_attr(miri, ignore)]
3075    unsafe fn test_mm_mfence() {
3076        _mm_mfence();
3077    }
3078
3079    #[simd_test(enable = "sse2")]
3080    unsafe fn test_mm_add_epi8() {
3081        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3082        #[rustfmt::skip]
3083        let b = _mm_setr_epi8(
3084            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3085        );
3086        let r = _mm_add_epi8(a, b);
3087        #[rustfmt::skip]
3088        let e = _mm_setr_epi8(
3089            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3090        );
3091        assert_eq_m128i(r, e);
3092    }
3093
3094    #[simd_test(enable = "sse2")]
3095    unsafe fn test_mm_add_epi8_overflow() {
3096        let a = _mm_set1_epi8(0x7F);
3097        let b = _mm_set1_epi8(1);
3098        let r = _mm_add_epi8(a, b);
3099        assert_eq_m128i(r, _mm_set1_epi8(-128));
3100    }
3101
3102    #[simd_test(enable = "sse2")]
3103    unsafe fn test_mm_add_epi16() {
3104        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3105        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3106        let r = _mm_add_epi16(a, b);
3107        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3108        assert_eq_m128i(r, e);
3109    }
3110
3111    #[simd_test(enable = "sse2")]
3112    unsafe fn test_mm_add_epi32() {
3113        let a = _mm_setr_epi32(0, 1, 2, 3);
3114        let b = _mm_setr_epi32(4, 5, 6, 7);
3115        let r = _mm_add_epi32(a, b);
3116        let e = _mm_setr_epi32(4, 6, 8, 10);
3117        assert_eq_m128i(r, e);
3118    }
3119
3120    #[simd_test(enable = "sse2")]
3121    unsafe fn test_mm_add_epi64() {
3122        let a = _mm_setr_epi64x(0, 1);
3123        let b = _mm_setr_epi64x(2, 3);
3124        let r = _mm_add_epi64(a, b);
3125        let e = _mm_setr_epi64x(2, 4);
3126        assert_eq_m128i(r, e);
3127    }
3128
3129    #[simd_test(enable = "sse2")]
3130    unsafe fn test_mm_adds_epi8() {
3131        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3132        #[rustfmt::skip]
3133        let b = _mm_setr_epi8(
3134            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3135        );
3136        let r = _mm_adds_epi8(a, b);
3137        #[rustfmt::skip]
3138        let e = _mm_setr_epi8(
3139            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3140        );
3141        assert_eq_m128i(r, e);
3142    }
3143
3144    #[simd_test(enable = "sse2")]
3145    unsafe fn test_mm_adds_epi8_saturate_positive() {
3146        let a = _mm_set1_epi8(0x7F);
3147        let b = _mm_set1_epi8(1);
3148        let r = _mm_adds_epi8(a, b);
3149        assert_eq_m128i(r, a);
3150    }
3151
3152    #[simd_test(enable = "sse2")]
3153    unsafe fn test_mm_adds_epi8_saturate_negative() {
3154        let a = _mm_set1_epi8(-0x80);
3155        let b = _mm_set1_epi8(-1);
3156        let r = _mm_adds_epi8(a, b);
3157        assert_eq_m128i(r, a);
3158    }
3159
3160    #[simd_test(enable = "sse2")]
3161    unsafe fn test_mm_adds_epi16() {
3162        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3163        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3164        let r = _mm_adds_epi16(a, b);
3165        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3166        assert_eq_m128i(r, e);
3167    }
3168
3169    #[simd_test(enable = "sse2")]
3170    unsafe fn test_mm_adds_epi16_saturate_positive() {
3171        let a = _mm_set1_epi16(0x7FFF);
3172        let b = _mm_set1_epi16(1);
3173        let r = _mm_adds_epi16(a, b);
3174        assert_eq_m128i(r, a);
3175    }
3176
3177    #[simd_test(enable = "sse2")]
3178    unsafe fn test_mm_adds_epi16_saturate_negative() {
3179        let a = _mm_set1_epi16(-0x8000);
3180        let b = _mm_set1_epi16(-1);
3181        let r = _mm_adds_epi16(a, b);
3182        assert_eq_m128i(r, a);
3183    }
3184
3185    #[simd_test(enable = "sse2")]
3186    unsafe fn test_mm_adds_epu8() {
3187        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3188        #[rustfmt::skip]
3189        let b = _mm_setr_epi8(
3190            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3191        );
3192        let r = _mm_adds_epu8(a, b);
3193        #[rustfmt::skip]
3194        let e = _mm_setr_epi8(
3195            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3196        );
3197        assert_eq_m128i(r, e);
3198    }
3199
3200    #[simd_test(enable = "sse2")]
3201    unsafe fn test_mm_adds_epu8_saturate() {
3202        let a = _mm_set1_epi8(!0);
3203        let b = _mm_set1_epi8(1);
3204        let r = _mm_adds_epu8(a, b);
3205        assert_eq_m128i(r, a);
3206    }
3207
3208    #[simd_test(enable = "sse2")]
3209    unsafe fn test_mm_adds_epu16() {
3210        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3211        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3212        let r = _mm_adds_epu16(a, b);
3213        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3214        assert_eq_m128i(r, e);
3215    }
3216
3217    #[simd_test(enable = "sse2")]
3218    unsafe fn test_mm_adds_epu16_saturate() {
3219        let a = _mm_set1_epi16(!0);
3220        let b = _mm_set1_epi16(1);
3221        let r = _mm_adds_epu16(a, b);
3222        assert_eq_m128i(r, a);
3223    }
3224
3225    #[simd_test(enable = "sse2")]
3226    unsafe fn test_mm_avg_epu8() {
3227        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3228        let r = _mm_avg_epu8(a, b);
3229        assert_eq_m128i(r, _mm_set1_epi8(6));
3230    }
3231
3232    #[simd_test(enable = "sse2")]
3233    unsafe fn test_mm_avg_epu16() {
3234        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3235        let r = _mm_avg_epu16(a, b);
3236        assert_eq_m128i(r, _mm_set1_epi16(6));
3237    }
3238
3239    #[simd_test(enable = "sse2")]
3240    unsafe fn test_mm_madd_epi16() {
3241        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3242        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3243        let r = _mm_madd_epi16(a, b);
3244        let e = _mm_setr_epi32(29, 81, 149, 233);
3245        assert_eq_m128i(r, e);
3246
3247        // Test large values.
3248        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3249        let a = _mm_setr_epi16(
3250            i16::MAX,
3251            i16::MAX,
3252            i16::MIN,
3253            i16::MIN,
3254            i16::MIN,
3255            i16::MAX,
3256            0,
3257            0,
3258        );
3259        let b = _mm_setr_epi16(
3260            i16::MAX,
3261            i16::MAX,
3262            i16::MIN,
3263            i16::MIN,
3264            i16::MAX,
3265            i16::MIN,
3266            0,
3267            0,
3268        );
3269        let r = _mm_madd_epi16(a, b);
3270        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3271        assert_eq_m128i(r, e);
3272    }
3273
3274    #[simd_test(enable = "sse2")]
3275    unsafe fn test_mm_max_epi16() {
3276        let a = _mm_set1_epi16(1);
3277        let b = _mm_set1_epi16(-1);
3278        let r = _mm_max_epi16(a, b);
3279        assert_eq_m128i(r, a);
3280    }
3281
3282    #[simd_test(enable = "sse2")]
3283    unsafe fn test_mm_max_epu8() {
3284        let a = _mm_set1_epi8(1);
3285        let b = _mm_set1_epi8(!0);
3286        let r = _mm_max_epu8(a, b);
3287        assert_eq_m128i(r, b);
3288    }
3289
3290    #[simd_test(enable = "sse2")]
3291    unsafe fn test_mm_min_epi16() {
3292        let a = _mm_set1_epi16(1);
3293        let b = _mm_set1_epi16(-1);
3294        let r = _mm_min_epi16(a, b);
3295        assert_eq_m128i(r, b);
3296    }
3297
3298    #[simd_test(enable = "sse2")]
3299    unsafe fn test_mm_min_epu8() {
3300        let a = _mm_set1_epi8(1);
3301        let b = _mm_set1_epi8(!0);
3302        let r = _mm_min_epu8(a, b);
3303        assert_eq_m128i(r, a);
3304    }
3305
3306    #[simd_test(enable = "sse2")]
3307    unsafe fn test_mm_mulhi_epi16() {
3308        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3309        let r = _mm_mulhi_epi16(a, b);
3310        assert_eq_m128i(r, _mm_set1_epi16(-16));
3311    }
3312
3313    #[simd_test(enable = "sse2")]
3314    unsafe fn test_mm_mulhi_epu16() {
3315        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3316        let r = _mm_mulhi_epu16(a, b);
3317        assert_eq_m128i(r, _mm_set1_epi16(15));
3318    }
3319
3320    #[simd_test(enable = "sse2")]
3321    unsafe fn test_mm_mullo_epi16() {
3322        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3323        let r = _mm_mullo_epi16(a, b);
3324        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3325    }
3326
3327    #[simd_test(enable = "sse2")]
3328    unsafe fn test_mm_mul_epu32() {
3329        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3330        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3331        let r = _mm_mul_epu32(a, b);
3332        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3333        assert_eq_m128i(r, e);
3334    }
3335
3336    #[simd_test(enable = "sse2")]
3337    unsafe fn test_mm_sad_epu8() {
3338        #[rustfmt::skip]
3339        let a = _mm_setr_epi8(
3340            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3341            1, 2, 3, 4,
3342            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3343            1, 2, 3, 4,
3344        );
3345        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3346        let r = _mm_sad_epu8(a, b);
3347        let e = _mm_setr_epi64x(1020, 614);
3348        assert_eq_m128i(r, e);
3349    }
3350
3351    #[simd_test(enable = "sse2")]
3352    unsafe fn test_mm_sub_epi8() {
3353        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3354        let r = _mm_sub_epi8(a, b);
3355        assert_eq_m128i(r, _mm_set1_epi8(-1));
3356    }
3357
3358    #[simd_test(enable = "sse2")]
3359    unsafe fn test_mm_sub_epi16() {
3360        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3361        let r = _mm_sub_epi16(a, b);
3362        assert_eq_m128i(r, _mm_set1_epi16(-1));
3363    }
3364
3365    #[simd_test(enable = "sse2")]
3366    unsafe fn test_mm_sub_epi32() {
3367        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3368        let r = _mm_sub_epi32(a, b);
3369        assert_eq_m128i(r, _mm_set1_epi32(-1));
3370    }
3371
3372    #[simd_test(enable = "sse2")]
3373    unsafe fn test_mm_sub_epi64() {
3374        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3375        let r = _mm_sub_epi64(a, b);
3376        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3377    }
3378
3379    #[simd_test(enable = "sse2")]
3380    unsafe fn test_mm_subs_epi8() {
3381        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3382        let r = _mm_subs_epi8(a, b);
3383        assert_eq_m128i(r, _mm_set1_epi8(3));
3384    }
3385
3386    #[simd_test(enable = "sse2")]
3387    unsafe fn test_mm_subs_epi8_saturate_positive() {
3388        let a = _mm_set1_epi8(0x7F);
3389        let b = _mm_set1_epi8(-1);
3390        let r = _mm_subs_epi8(a, b);
3391        assert_eq_m128i(r, a);
3392    }
3393
3394    #[simd_test(enable = "sse2")]
3395    unsafe fn test_mm_subs_epi8_saturate_negative() {
3396        let a = _mm_set1_epi8(-0x80);
3397        let b = _mm_set1_epi8(1);
3398        let r = _mm_subs_epi8(a, b);
3399        assert_eq_m128i(r, a);
3400    }
3401
3402    #[simd_test(enable = "sse2")]
3403    unsafe fn test_mm_subs_epi16() {
3404        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3405        let r = _mm_subs_epi16(a, b);
3406        assert_eq_m128i(r, _mm_set1_epi16(3));
3407    }
3408
3409    #[simd_test(enable = "sse2")]
3410    unsafe fn test_mm_subs_epi16_saturate_positive() {
3411        let a = _mm_set1_epi16(0x7FFF);
3412        let b = _mm_set1_epi16(-1);
3413        let r = _mm_subs_epi16(a, b);
3414        assert_eq_m128i(r, a);
3415    }
3416
3417    #[simd_test(enable = "sse2")]
3418    unsafe fn test_mm_subs_epi16_saturate_negative() {
3419        let a = _mm_set1_epi16(-0x8000);
3420        let b = _mm_set1_epi16(1);
3421        let r = _mm_subs_epi16(a, b);
3422        assert_eq_m128i(r, a);
3423    }
3424
3425    #[simd_test(enable = "sse2")]
3426    unsafe fn test_mm_subs_epu8() {
3427        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3428        let r = _mm_subs_epu8(a, b);
3429        assert_eq_m128i(r, _mm_set1_epi8(3));
3430    }
3431
3432    #[simd_test(enable = "sse2")]
3433    unsafe fn test_mm_subs_epu8_saturate() {
3434        let a = _mm_set1_epi8(0);
3435        let b = _mm_set1_epi8(1);
3436        let r = _mm_subs_epu8(a, b);
3437        assert_eq_m128i(r, a);
3438    }
3439
3440    #[simd_test(enable = "sse2")]
3441    unsafe fn test_mm_subs_epu16() {
3442        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3443        let r = _mm_subs_epu16(a, b);
3444        assert_eq_m128i(r, _mm_set1_epi16(3));
3445    }
3446
3447    #[simd_test(enable = "sse2")]
3448    unsafe fn test_mm_subs_epu16_saturate() {
3449        let a = _mm_set1_epi16(0);
3450        let b = _mm_set1_epi16(1);
3451        let r = _mm_subs_epu16(a, b);
3452        assert_eq_m128i(r, a);
3453    }
3454
3455    #[simd_test(enable = "sse2")]
3456    unsafe fn test_mm_slli_si128() {
3457        #[rustfmt::skip]
3458        let a = _mm_setr_epi8(
3459            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3460        );
3461        let r = _mm_slli_si128::<1>(a);
3462        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3463        assert_eq_m128i(r, e);
3464
3465        #[rustfmt::skip]
3466        let a = _mm_setr_epi8(
3467            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3468        );
3469        let r = _mm_slli_si128::<15>(a);
3470        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3471        assert_eq_m128i(r, e);
3472
3473        #[rustfmt::skip]
3474        let a = _mm_setr_epi8(
3475            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3476        );
3477        let r = _mm_slli_si128::<16>(a);
3478        assert_eq_m128i(r, _mm_set1_epi8(0));
3479    }
3480
3481    #[simd_test(enable = "sse2")]
3482    unsafe fn test_mm_slli_epi16() {
3483        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3484        let r = _mm_slli_epi16::<4>(a);
3485        assert_eq_m128i(
3486            r,
3487            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3488        );
3489        let r = _mm_slli_epi16::<16>(a);
3490        assert_eq_m128i(r, _mm_set1_epi16(0));
3491    }
3492
3493    #[simd_test(enable = "sse2")]
3494    unsafe fn test_mm_sll_epi16() {
3495        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3496        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3497        assert_eq_m128i(
3498            r,
3499            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3500        );
3501        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3502        assert_eq_m128i(r, a);
3503        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3504        assert_eq_m128i(r, _mm_set1_epi16(0));
3505        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3506        assert_eq_m128i(r, _mm_set1_epi16(0));
3507    }
3508
3509    #[simd_test(enable = "sse2")]
3510    unsafe fn test_mm_slli_epi32() {
3511        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3512        let r = _mm_slli_epi32::<4>(a);
3513        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3514        let r = _mm_slli_epi32::<32>(a);
3515        assert_eq_m128i(r, _mm_set1_epi32(0));
3516    }
3517
3518    #[simd_test(enable = "sse2")]
3519    unsafe fn test_mm_sll_epi32() {
3520        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3521        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3522        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3523        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3524        assert_eq_m128i(r, a);
3525        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3526        assert_eq_m128i(r, _mm_set1_epi32(0));
3527        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3528        assert_eq_m128i(r, _mm_set1_epi32(0));
3529    }
3530
3531    #[simd_test(enable = "sse2")]
3532    unsafe fn test_mm_slli_epi64() {
3533        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3534        let r = _mm_slli_epi64::<4>(a);
3535        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3536        let r = _mm_slli_epi64::<64>(a);
3537        assert_eq_m128i(r, _mm_set1_epi64x(0));
3538    }
3539
3540    #[simd_test(enable = "sse2")]
3541    unsafe fn test_mm_sll_epi64() {
3542        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3543        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3544        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3545        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3546        assert_eq_m128i(r, a);
3547        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3548        assert_eq_m128i(r, _mm_set1_epi64x(0));
3549        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3550        assert_eq_m128i(r, _mm_set1_epi64x(0));
3551    }
3552
3553    #[simd_test(enable = "sse2")]
3554    unsafe fn test_mm_srai_epi16() {
3555        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3556        let r = _mm_srai_epi16::<4>(a);
3557        assert_eq_m128i(
3558            r,
3559            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3560        );
3561        let r = _mm_srai_epi16::<16>(a);
3562        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3563    }
3564
3565    #[simd_test(enable = "sse2")]
3566    unsafe fn test_mm_sra_epi16() {
3567        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3568        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3569        assert_eq_m128i(
3570            r,
3571            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3572        );
3573        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3574        assert_eq_m128i(r, a);
3575        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3576        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3577        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3578        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3579    }
3580
3581    #[simd_test(enable = "sse2")]
3582    unsafe fn test_mm_srai_epi32() {
3583        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3584        let r = _mm_srai_epi32::<4>(a);
3585        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3586        let r = _mm_srai_epi32::<32>(a);
3587        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3588    }
3589
3590    #[simd_test(enable = "sse2")]
3591    unsafe fn test_mm_sra_epi32() {
3592        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3593        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3594        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3595        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3596        assert_eq_m128i(r, a);
3597        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3598        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3599        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3600        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3601    }
3602
3603    #[simd_test(enable = "sse2")]
3604    unsafe fn test_mm_srli_si128() {
3605        #[rustfmt::skip]
3606        let a = _mm_setr_epi8(
3607            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3608        );
3609        let r = _mm_srli_si128::<1>(a);
3610        #[rustfmt::skip]
3611        let e = _mm_setr_epi8(
3612            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3613        );
3614        assert_eq_m128i(r, e);
3615
3616        #[rustfmt::skip]
3617        let a = _mm_setr_epi8(
3618            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3619        );
3620        let r = _mm_srli_si128::<15>(a);
3621        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3622        assert_eq_m128i(r, e);
3623
3624        #[rustfmt::skip]
3625        let a = _mm_setr_epi8(
3626            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3627        );
3628        let r = _mm_srli_si128::<16>(a);
3629        assert_eq_m128i(r, _mm_set1_epi8(0));
3630    }
3631
3632    #[simd_test(enable = "sse2")]
3633    unsafe fn test_mm_srli_epi16() {
3634        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3635        let r = _mm_srli_epi16::<4>(a);
3636        assert_eq_m128i(
3637            r,
3638            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3639        );
3640        let r = _mm_srli_epi16::<16>(a);
3641        assert_eq_m128i(r, _mm_set1_epi16(0));
3642    }
3643
3644    #[simd_test(enable = "sse2")]
3645    unsafe fn test_mm_srl_epi16() {
3646        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3647        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3648        assert_eq_m128i(
3649            r,
3650            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3651        );
3652        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3653        assert_eq_m128i(r, a);
3654        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3655        assert_eq_m128i(r, _mm_set1_epi16(0));
3656        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3657        assert_eq_m128i(r, _mm_set1_epi16(0));
3658    }
3659
3660    #[simd_test(enable = "sse2")]
3661    unsafe fn test_mm_srli_epi32() {
3662        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3663        let r = _mm_srli_epi32::<4>(a);
3664        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3665        let r = _mm_srli_epi32::<32>(a);
3666        assert_eq_m128i(r, _mm_set1_epi32(0));
3667    }
3668
3669    #[simd_test(enable = "sse2")]
3670    unsafe fn test_mm_srl_epi32() {
3671        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3672        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
3673        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3674        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
3675        assert_eq_m128i(r, a);
3676        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
3677        assert_eq_m128i(r, _mm_set1_epi32(0));
3678        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
3679        assert_eq_m128i(r, _mm_set1_epi32(0));
3680    }
3681
3682    #[simd_test(enable = "sse2")]
3683    unsafe fn test_mm_srli_epi64() {
3684        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3685        let r = _mm_srli_epi64::<4>(a);
3686        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3687        let r = _mm_srli_epi64::<64>(a);
3688        assert_eq_m128i(r, _mm_set1_epi64x(0));
3689    }
3690
3691    #[simd_test(enable = "sse2")]
3692    unsafe fn test_mm_srl_epi64() {
3693        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3694        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
3695        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3696        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
3697        assert_eq_m128i(r, a);
3698        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
3699        assert_eq_m128i(r, _mm_set1_epi64x(0));
3700        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
3701        assert_eq_m128i(r, _mm_set1_epi64x(0));
3702    }
3703
3704    #[simd_test(enable = "sse2")]
3705    unsafe fn test_mm_and_si128() {
3706        let a = _mm_set1_epi8(5);
3707        let b = _mm_set1_epi8(3);
3708        let r = _mm_and_si128(a, b);
3709        assert_eq_m128i(r, _mm_set1_epi8(1));
3710    }
3711
3712    #[simd_test(enable = "sse2")]
3713    unsafe fn test_mm_andnot_si128() {
3714        let a = _mm_set1_epi8(5);
3715        let b = _mm_set1_epi8(3);
3716        let r = _mm_andnot_si128(a, b);
3717        assert_eq_m128i(r, _mm_set1_epi8(2));
3718    }
3719
3720    #[simd_test(enable = "sse2")]
3721    unsafe fn test_mm_or_si128() {
3722        let a = _mm_set1_epi8(5);
3723        let b = _mm_set1_epi8(3);
3724        let r = _mm_or_si128(a, b);
3725        assert_eq_m128i(r, _mm_set1_epi8(7));
3726    }
3727
3728    #[simd_test(enable = "sse2")]
3729    unsafe fn test_mm_xor_si128() {
3730        let a = _mm_set1_epi8(5);
3731        let b = _mm_set1_epi8(3);
3732        let r = _mm_xor_si128(a, b);
3733        assert_eq_m128i(r, _mm_set1_epi8(6));
3734    }
3735
3736    #[simd_test(enable = "sse2")]
3737    unsafe fn test_mm_cmpeq_epi8() {
3738        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3739        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
3740        let r = _mm_cmpeq_epi8(a, b);
3741        #[rustfmt::skip]
3742        assert_eq_m128i(
3743            r,
3744            _mm_setr_epi8(
3745                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
3746            )
3747        );
3748    }
3749
3750    #[simd_test(enable = "sse2")]
3751    unsafe fn test_mm_cmpeq_epi16() {
3752        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3753        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
3754        let r = _mm_cmpeq_epi16(a, b);
3755        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
3756    }
3757
3758    #[simd_test(enable = "sse2")]
3759    unsafe fn test_mm_cmpeq_epi32() {
3760        let a = _mm_setr_epi32(0, 1, 2, 3);
3761        let b = _mm_setr_epi32(3, 2, 2, 0);
3762        let r = _mm_cmpeq_epi32(a, b);
3763        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
3764    }
3765
3766    #[simd_test(enable = "sse2")]
3767    unsafe fn test_mm_cmpgt_epi8() {
3768        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3769        let b = _mm_set1_epi8(0);
3770        let r = _mm_cmpgt_epi8(a, b);
3771        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3772        assert_eq_m128i(r, e);
3773    }
3774
3775    #[simd_test(enable = "sse2")]
3776    unsafe fn test_mm_cmpgt_epi16() {
3777        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3778        let b = _mm_set1_epi16(0);
3779        let r = _mm_cmpgt_epi16(a, b);
3780        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3781        assert_eq_m128i(r, e);
3782    }
3783
3784    #[simd_test(enable = "sse2")]
3785    unsafe fn test_mm_cmpgt_epi32() {
3786        let a = _mm_set_epi32(5, 0, 0, 0);
3787        let b = _mm_set1_epi32(0);
3788        let r = _mm_cmpgt_epi32(a, b);
3789        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3790    }
3791
3792    #[simd_test(enable = "sse2")]
3793    unsafe fn test_mm_cmplt_epi8() {
3794        let a = _mm_set1_epi8(0);
3795        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3796        let r = _mm_cmplt_epi8(a, b);
3797        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3798        assert_eq_m128i(r, e);
3799    }
3800
3801    #[simd_test(enable = "sse2")]
3802    unsafe fn test_mm_cmplt_epi16() {
3803        let a = _mm_set1_epi16(0);
3804        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3805        let r = _mm_cmplt_epi16(a, b);
3806        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3807        assert_eq_m128i(r, e);
3808    }
3809
3810    #[simd_test(enable = "sse2")]
3811    unsafe fn test_mm_cmplt_epi32() {
3812        let a = _mm_set1_epi32(0);
3813        let b = _mm_set_epi32(5, 0, 0, 0);
3814        let r = _mm_cmplt_epi32(a, b);
3815        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3816    }
3817
3818    #[simd_test(enable = "sse2")]
3819    unsafe fn test_mm_cvtepi32_pd() {
3820        let a = _mm_set_epi32(35, 25, 15, 5);
3821        let r = _mm_cvtepi32_pd(a);
3822        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
3823    }
3824
3825    #[simd_test(enable = "sse2")]
3826    unsafe fn test_mm_cvtsi32_sd() {
3827        let a = _mm_set1_pd(3.5);
3828        let r = _mm_cvtsi32_sd(a, 5);
3829        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
3830    }
3831
3832    #[simd_test(enable = "sse2")]
3833    unsafe fn test_mm_cvtepi32_ps() {
3834        let a = _mm_setr_epi32(1, 2, 3, 4);
3835        let r = _mm_cvtepi32_ps(a);
3836        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3837    }
3838
3839    #[simd_test(enable = "sse2")]
3840    unsafe fn test_mm_cvtps_epi32() {
3841        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3842        let r = _mm_cvtps_epi32(a);
3843        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
3844    }
3845
3846    #[simd_test(enable = "sse2")]
3847    unsafe fn test_mm_cvtsi32_si128() {
3848        let r = _mm_cvtsi32_si128(5);
3849        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
3850    }
3851
3852    #[simd_test(enable = "sse2")]
3853    unsafe fn test_mm_cvtsi128_si32() {
3854        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
3855        assert_eq!(r, 5);
3856    }
3857
3858    #[simd_test(enable = "sse2")]
3859    unsafe fn test_mm_set_epi64x() {
3860        let r = _mm_set_epi64x(0, 1);
3861        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
3862    }
3863
3864    #[simd_test(enable = "sse2")]
3865    unsafe fn test_mm_set_epi32() {
3866        let r = _mm_set_epi32(0, 1, 2, 3);
3867        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
3868    }
3869
3870    #[simd_test(enable = "sse2")]
3871    unsafe fn test_mm_set_epi16() {
3872        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3873        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
3874    }
3875
3876    #[simd_test(enable = "sse2")]
3877    unsafe fn test_mm_set_epi8() {
3878        #[rustfmt::skip]
3879        let r = _mm_set_epi8(
3880            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3881        );
3882        #[rustfmt::skip]
3883        let e = _mm_setr_epi8(
3884            15, 14, 13, 12, 11, 10, 9, 8,
3885            7, 6, 5, 4, 3, 2, 1, 0,
3886        );
3887        assert_eq_m128i(r, e);
3888    }
3889
3890    #[simd_test(enable = "sse2")]
3891    unsafe fn test_mm_set1_epi64x() {
3892        let r = _mm_set1_epi64x(1);
3893        assert_eq_m128i(r, _mm_set1_epi64x(1));
3894    }
3895
3896    #[simd_test(enable = "sse2")]
3897    unsafe fn test_mm_set1_epi32() {
3898        let r = _mm_set1_epi32(1);
3899        assert_eq_m128i(r, _mm_set1_epi32(1));
3900    }
3901
3902    #[simd_test(enable = "sse2")]
3903    unsafe fn test_mm_set1_epi16() {
3904        let r = _mm_set1_epi16(1);
3905        assert_eq_m128i(r, _mm_set1_epi16(1));
3906    }
3907
3908    #[simd_test(enable = "sse2")]
3909    unsafe fn test_mm_set1_epi8() {
3910        let r = _mm_set1_epi8(1);
3911        assert_eq_m128i(r, _mm_set1_epi8(1));
3912    }
3913
3914    #[simd_test(enable = "sse2")]
3915    unsafe fn test_mm_setr_epi32() {
3916        let r = _mm_setr_epi32(0, 1, 2, 3);
3917        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
3918    }
3919
3920    #[simd_test(enable = "sse2")]
3921    unsafe fn test_mm_setr_epi16() {
3922        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3923        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
3924    }
3925
3926    #[simd_test(enable = "sse2")]
3927    unsafe fn test_mm_setr_epi8() {
3928        #[rustfmt::skip]
3929        let r = _mm_setr_epi8(
3930            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3931        );
3932        #[rustfmt::skip]
3933        let e = _mm_setr_epi8(
3934            0, 1, 2, 3, 4, 5, 6, 7,
3935            8, 9, 10, 11, 12, 13, 14, 15,
3936        );
3937        assert_eq_m128i(r, e);
3938    }
3939
3940    #[simd_test(enable = "sse2")]
3941    unsafe fn test_mm_setzero_si128() {
3942        let r = _mm_setzero_si128();
3943        assert_eq_m128i(r, _mm_set1_epi64x(0));
3944    }
3945
3946    #[simd_test(enable = "sse2")]
3947    unsafe fn test_mm_loadl_epi64() {
3948        let a = _mm_setr_epi64x(6, 5);
3949        let r = _mm_loadl_epi64(ptr::addr_of!(a));
3950        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
3951    }
3952
3953    #[simd_test(enable = "sse2")]
3954    unsafe fn test_mm_load_si128() {
3955        let a = _mm_set_epi64x(5, 6);
3956        let r = _mm_load_si128(ptr::addr_of!(a) as *const _);
3957        assert_eq_m128i(a, r);
3958    }
3959
3960    #[simd_test(enable = "sse2")]
3961    unsafe fn test_mm_loadu_si128() {
3962        let a = _mm_set_epi64x(5, 6);
3963        let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _);
3964        assert_eq_m128i(a, r);
3965    }
3966
3967    #[simd_test(enable = "sse2")]
3968    // Miri cannot support this until it is clear how it fits in the Rust memory model
3969    // (non-temporal store)
3970    #[cfg_attr(miri, ignore)]
3971    unsafe fn test_mm_maskmoveu_si128() {
3972        let a = _mm_set1_epi8(9);
3973        #[rustfmt::skip]
3974        let mask = _mm_set_epi8(
3975            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
3976            0, 0, 0, 0, 0, 0, 0, 0,
3977        );
3978        let mut r = _mm_set1_epi8(0);
3979        _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
3980        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3981        assert_eq_m128i(r, e);
3982    }
3983
3984    #[simd_test(enable = "sse2")]
3985    unsafe fn test_mm_store_si128() {
3986        let a = _mm_set1_epi8(9);
3987        let mut r = _mm_set1_epi8(0);
3988        _mm_store_si128(&mut r, a);
3989        assert_eq_m128i(r, a);
3990    }
3991
3992    #[simd_test(enable = "sse2")]
3993    unsafe fn test_mm_storeu_si128() {
3994        let a = _mm_set1_epi8(9);
3995        let mut r = _mm_set1_epi8(0);
3996        _mm_storeu_si128(&mut r, a);
3997        assert_eq_m128i(r, a);
3998    }
3999
4000    #[simd_test(enable = "sse2")]
4001    unsafe fn test_mm_storel_epi64() {
4002        let a = _mm_setr_epi64x(2, 9);
4003        let mut r = _mm_set1_epi8(0);
4004        _mm_storel_epi64(&mut r, a);
4005        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
4006    }
4007
4008    #[simd_test(enable = "sse2")]
4009    // Miri cannot support this until it is clear how it fits in the Rust memory model
4010    // (non-temporal store)
4011    #[cfg_attr(miri, ignore)]
4012    unsafe fn test_mm_stream_si128() {
4013        let a = _mm_setr_epi32(1, 2, 3, 4);
4014        let mut r = _mm_undefined_si128();
4015        _mm_stream_si128(ptr::addr_of_mut!(r), a);
4016        assert_eq_m128i(r, a);
4017    }
4018
4019    #[simd_test(enable = "sse2")]
4020    // Miri cannot support this until it is clear how it fits in the Rust memory model
4021    // (non-temporal store)
4022    #[cfg_attr(miri, ignore)]
4023    unsafe fn test_mm_stream_si32() {
4024        let a: i32 = 7;
4025        let mut mem = boxed::Box::<i32>::new(-1);
4026        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
4027        assert_eq!(a, *mem);
4028    }
4029
4030    #[simd_test(enable = "sse2")]
4031    unsafe fn test_mm_move_epi64() {
4032        let a = _mm_setr_epi64x(5, 6);
4033        let r = _mm_move_epi64(a);
4034        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4035    }
4036
4037    #[simd_test(enable = "sse2")]
4038    unsafe fn test_mm_packs_epi16() {
4039        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
4040        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
4041        let r = _mm_packs_epi16(a, b);
4042        #[rustfmt::skip]
4043        assert_eq_m128i(
4044            r,
4045            _mm_setr_epi8(
4046                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
4047            )
4048        );
4049    }
4050
4051    #[simd_test(enable = "sse2")]
4052    unsafe fn test_mm_packs_epi32() {
4053        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
4054        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
4055        let r = _mm_packs_epi32(a, b);
4056        assert_eq_m128i(
4057            r,
4058            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
4059        );
4060    }
4061
4062    #[simd_test(enable = "sse2")]
4063    unsafe fn test_mm_packus_epi16() {
4064        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
4065        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
4066        let r = _mm_packus_epi16(a, b);
4067        assert_eq_m128i(
4068            r,
4069            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
4070        );
4071    }
4072
4073    #[simd_test(enable = "sse2")]
4074    unsafe fn test_mm_extract_epi16() {
4075        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
4076        let r1 = _mm_extract_epi16::<0>(a);
4077        let r2 = _mm_extract_epi16::<3>(a);
4078        assert_eq!(r1, 0xFFFF);
4079        assert_eq!(r2, 3);
4080    }
4081
4082    #[simd_test(enable = "sse2")]
4083    unsafe fn test_mm_insert_epi16() {
4084        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4085        let r = _mm_insert_epi16::<0>(a, 9);
4086        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
4087        assert_eq_m128i(r, e);
4088    }
4089
4090    #[simd_test(enable = "sse2")]
4091    unsafe fn test_mm_movemask_epi8() {
4092        #[rustfmt::skip]
4093        let a = _mm_setr_epi8(
4094            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
4095            0b0101, 0b1111_0000u8 as i8, 0, 0,
4096            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
4097            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
4098        );
4099        let r = _mm_movemask_epi8(a);
4100        assert_eq!(r, 0b10100110_00100101);
4101    }
4102
4103    #[simd_test(enable = "sse2")]
4104    unsafe fn test_mm_shuffle_epi32() {
4105        let a = _mm_setr_epi32(5, 10, 15, 20);
4106        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
4107        let e = _mm_setr_epi32(20, 10, 10, 5);
4108        assert_eq_m128i(r, e);
4109    }
4110
4111    #[simd_test(enable = "sse2")]
4112    unsafe fn test_mm_shufflehi_epi16() {
4113        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
4114        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
4115        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
4116        assert_eq_m128i(r, e);
4117    }
4118
4119    #[simd_test(enable = "sse2")]
4120    unsafe fn test_mm_shufflelo_epi16() {
4121        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
4122        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
4123        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
4124        assert_eq_m128i(r, e);
4125    }
4126
4127    #[simd_test(enable = "sse2")]
4128    unsafe fn test_mm_unpackhi_epi8() {
4129        #[rustfmt::skip]
4130        let a = _mm_setr_epi8(
4131            0, 1, 2, 3, 4, 5, 6, 7,
4132            8, 9, 10, 11, 12, 13, 14, 15,
4133        );
4134        #[rustfmt::skip]
4135        let b = _mm_setr_epi8(
4136            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4137        );
4138        let r = _mm_unpackhi_epi8(a, b);
4139        #[rustfmt::skip]
4140        let e = _mm_setr_epi8(
4141            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
4142        );
4143        assert_eq_m128i(r, e);
4144    }
4145
4146    #[simd_test(enable = "sse2")]
4147    unsafe fn test_mm_unpackhi_epi16() {
4148        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4149        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4150        let r = _mm_unpackhi_epi16(a, b);
4151        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
4152        assert_eq_m128i(r, e);
4153    }
4154
4155    #[simd_test(enable = "sse2")]
4156    unsafe fn test_mm_unpackhi_epi32() {
4157        let a = _mm_setr_epi32(0, 1, 2, 3);
4158        let b = _mm_setr_epi32(4, 5, 6, 7);
4159        let r = _mm_unpackhi_epi32(a, b);
4160        let e = _mm_setr_epi32(2, 6, 3, 7);
4161        assert_eq_m128i(r, e);
4162    }
4163
4164    #[simd_test(enable = "sse2")]
4165    unsafe fn test_mm_unpackhi_epi64() {
4166        let a = _mm_setr_epi64x(0, 1);
4167        let b = _mm_setr_epi64x(2, 3);
4168        let r = _mm_unpackhi_epi64(a, b);
4169        let e = _mm_setr_epi64x(1, 3);
4170        assert_eq_m128i(r, e);
4171    }
4172
4173    #[simd_test(enable = "sse2")]
4174    unsafe fn test_mm_unpacklo_epi8() {
4175        #[rustfmt::skip]
4176        let a = _mm_setr_epi8(
4177            0, 1, 2, 3, 4, 5, 6, 7,
4178            8, 9, 10, 11, 12, 13, 14, 15,
4179        );
4180        #[rustfmt::skip]
4181        let b = _mm_setr_epi8(
4182            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
4183        );
4184        let r = _mm_unpacklo_epi8(a, b);
4185        #[rustfmt::skip]
4186        let e = _mm_setr_epi8(
4187            0, 16, 1, 17, 2, 18, 3, 19,
4188            4, 20, 5, 21, 6, 22, 7, 23,
4189        );
4190        assert_eq_m128i(r, e);
4191    }
4192
4193    #[simd_test(enable = "sse2")]
4194    unsafe fn test_mm_unpacklo_epi16() {
4195        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4196        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
4197        let r = _mm_unpacklo_epi16(a, b);
4198        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
4199        assert_eq_m128i(r, e);
4200    }
4201
4202    #[simd_test(enable = "sse2")]
4203    unsafe fn test_mm_unpacklo_epi32() {
4204        let a = _mm_setr_epi32(0, 1, 2, 3);
4205        let b = _mm_setr_epi32(4, 5, 6, 7);
4206        let r = _mm_unpacklo_epi32(a, b);
4207        let e = _mm_setr_epi32(0, 4, 1, 5);
4208        assert_eq_m128i(r, e);
4209    }
4210
4211    #[simd_test(enable = "sse2")]
4212    unsafe fn test_mm_unpacklo_epi64() {
4213        let a = _mm_setr_epi64x(0, 1);
4214        let b = _mm_setr_epi64x(2, 3);
4215        let r = _mm_unpacklo_epi64(a, b);
4216        let e = _mm_setr_epi64x(0, 2);
4217        assert_eq_m128i(r, e);
4218    }
4219
4220    #[simd_test(enable = "sse2")]
4221    unsafe fn test_mm_add_sd() {
4222        let a = _mm_setr_pd(1.0, 2.0);
4223        let b = _mm_setr_pd(5.0, 10.0);
4224        let r = _mm_add_sd(a, b);
4225        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
4226    }
4227
4228    #[simd_test(enable = "sse2")]
4229    unsafe fn test_mm_add_pd() {
4230        let a = _mm_setr_pd(1.0, 2.0);
4231        let b = _mm_setr_pd(5.0, 10.0);
4232        let r = _mm_add_pd(a, b);
4233        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
4234    }
4235
4236    #[simd_test(enable = "sse2")]
4237    unsafe fn test_mm_div_sd() {
4238        let a = _mm_setr_pd(1.0, 2.0);
4239        let b = _mm_setr_pd(5.0, 10.0);
4240        let r = _mm_div_sd(a, b);
4241        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
4242    }
4243
4244    #[simd_test(enable = "sse2")]
4245    unsafe fn test_mm_div_pd() {
4246        let a = _mm_setr_pd(1.0, 2.0);
4247        let b = _mm_setr_pd(5.0, 10.0);
4248        let r = _mm_div_pd(a, b);
4249        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
4250    }
4251
4252    #[simd_test(enable = "sse2")]
4253    unsafe fn test_mm_max_sd() {
4254        let a = _mm_setr_pd(1.0, 2.0);
4255        let b = _mm_setr_pd(5.0, 10.0);
4256        let r = _mm_max_sd(a, b);
4257        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4258    }
4259
4260    #[simd_test(enable = "sse2")]
4261    unsafe fn test_mm_max_pd() {
4262        let a = _mm_setr_pd(1.0, 2.0);
4263        let b = _mm_setr_pd(5.0, 10.0);
4264        let r = _mm_max_pd(a, b);
4265        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
4266
4267        // Check SSE(2)-specific semantics for -0.0 handling.
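        // When the operands compare equal (as +0.0 and -0.0 do), maxpd returns
        // its second operand, so the result depends on argument order.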
4268        let a = _mm_setr_pd(-0.0, 0.0);
4269        let b = _mm_setr_pd(0.0, 0.0);
4270        let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
4271        let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
4272        let a: [u8; 16] = transmute(a);
4273        let b: [u8; 16] = transmute(b);
4274        assert_eq!(r1, b);
4275        assert_eq!(r2, a);
4276        assert_ne!(a, b); // sanity check that -0.0 is actually present
4277    }
4278
4279    #[simd_test(enable = "sse2")]
4280    unsafe fn test_mm_min_sd() {
4281        let a = _mm_setr_pd(1.0, 2.0);
4282        let b = _mm_setr_pd(5.0, 10.0);
4283        let r = _mm_min_sd(a, b);
4284        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4285    }
4286
4287    #[simd_test(enable = "sse2")]
4288    unsafe fn test_mm_min_pd() {
4289        let a = _mm_setr_pd(1.0, 2.0);
4290        let b = _mm_setr_pd(5.0, 10.0);
4291        let r = _mm_min_pd(a, b);
4292        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4293
4294        // Check SSE(2)-specific semantics for -0.0 handling.
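        // When the operands compare equal (as +0.0 and -0.0 do), minpd returns
        // its second operand, so the result depends on argument order.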
4295        let a = _mm_setr_pd(-0.0, 0.0);
4296        let b = _mm_setr_pd(0.0, 0.0);
4297        let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
4298        let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
4299        let a: [u8; 16] = transmute(a);
4300        let b: [u8; 16] = transmute(b);
4301        assert_eq!(r1, b);
4302        assert_eq!(r2, a);
4303        assert_ne!(a, b); // sanity check that -0.0 is actually present
4304    }
4305
4306    #[simd_test(enable = "sse2")]
4307    unsafe fn test_mm_mul_sd() {
4308        let a = _mm_setr_pd(1.0, 2.0);
4309        let b = _mm_setr_pd(5.0, 10.0);
4310        let r = _mm_mul_sd(a, b);
4311        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
4312    }
4313
4314    #[simd_test(enable = "sse2")]
4315    unsafe fn test_mm_mul_pd() {
4316        let a = _mm_setr_pd(1.0, 2.0);
4317        let b = _mm_setr_pd(5.0, 10.0);
4318        let r = _mm_mul_pd(a, b);
4319        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
4320    }
4321
4322    #[simd_test(enable = "sse2")]
4323    unsafe fn test_mm_sqrt_sd() {
4324        let a = _mm_setr_pd(1.0, 2.0);
4325        let b = _mm_setr_pd(5.0, 10.0);
4326        let r = _mm_sqrt_sd(a, b);
4327        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
4328    }
4329
4330    #[simd_test(enable = "sse2")]
4331    unsafe fn test_mm_sqrt_pd() {
4332        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
4333        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
4334    }
4335
4336    #[simd_test(enable = "sse2")]
4337    unsafe fn test_mm_sub_sd() {
4338        let a = _mm_setr_pd(1.0, 2.0);
4339        let b = _mm_setr_pd(5.0, 10.0);
4340        let r = _mm_sub_sd(a, b);
4341        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
4342    }
4343
4344    #[simd_test(enable = "sse2")]
4345    unsafe fn test_mm_sub_pd() {
4346        let a = _mm_setr_pd(1.0, 2.0);
4347        let b = _mm_setr_pd(5.0, 10.0);
4348        let r = _mm_sub_pd(a, b);
4349        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
4350    }
4351
4352    #[simd_test(enable = "sse2")]
4353    unsafe fn test_mm_and_pd() {
4354        let a = transmute(u64x2::splat(5));
4355        let b = transmute(u64x2::splat(3));
4356        let r = _mm_and_pd(a, b);
4357        let e = transmute(u64x2::splat(1));
4358        assert_eq_m128d(r, e);
4359    }
4360
4361    #[simd_test(enable = "sse2")]
4362    unsafe fn test_mm_andnot_pd() {
4363        let a = transmute(u64x2::splat(5));
4364        let b = transmute(u64x2::splat(3));
4365        let r = _mm_andnot_pd(a, b);
4366        let e = transmute(u64x2::splat(2));
4367        assert_eq_m128d(r, e);
4368    }
4369
4370    #[simd_test(enable = "sse2")]
4371    unsafe fn test_mm_or_pd() {
4372        let a = transmute(u64x2::splat(5));
4373        let b = transmute(u64x2::splat(3));
4374        let r = _mm_or_pd(a, b);
4375        let e = transmute(u64x2::splat(7));
4376        assert_eq_m128d(r, e);
4377    }
4378
4379    #[simd_test(enable = "sse2")]
4380    unsafe fn test_mm_xor_pd() {
4381        let a = transmute(u64x2::splat(5));
4382        let b = transmute(u64x2::splat(3));
4383        let r = _mm_xor_pd(a, b);
4384        let e = transmute(u64x2::splat(6));
4385        assert_eq_m128d(r, e);
4386    }
4387
4388    #[simd_test(enable = "sse2")]
4389    unsafe fn test_mm_cmpeq_sd() {
4390        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
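        // The scalar comparison fills the low lane with an all-ones or all-zeros
        // mask; the high lane is copied through from `a` unchanged.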
4391        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4392        let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
4393        assert_eq_m128i(r, e);
4394    }
4395
4396    #[simd_test(enable = "sse2")]
4397    unsafe fn test_mm_cmplt_sd() {
4398        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4399        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4400        let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
4401        assert_eq_m128i(r, e);
4402    }
4403
4404    #[simd_test(enable = "sse2")]
4405    unsafe fn test_mm_cmple_sd() {
4406        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4407        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4408        let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
4409        assert_eq_m128i(r, e);
4410    }
4411
4412    #[simd_test(enable = "sse2")]
4413    unsafe fn test_mm_cmpgt_sd() {
4414        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4415        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4416        let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
4417        assert_eq_m128i(r, e);
4418    }
4419
4420    #[simd_test(enable = "sse2")]
4421    unsafe fn test_mm_cmpge_sd() {
4422        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4423        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4424        let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
4425        assert_eq_m128i(r, e);
4426    }
4427
4428    #[simd_test(enable = "sse2")]
4429    unsafe fn test_mm_cmpord_sd() {
4430        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4431        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4432        let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
4433        assert_eq_m128i(r, e);
4434    }
4435
4436    #[simd_test(enable = "sse2")]
4437    unsafe fn test_mm_cmpunord_sd() {
4438        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4439        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4440        let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
4441        assert_eq_m128i(r, e);
4442    }
4443
4444    #[simd_test(enable = "sse2")]
4445    unsafe fn test_mm_cmpneq_sd() {
4446        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4447        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
4448        let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
4449        assert_eq_m128i(r, e);
4450    }
4451
4452    #[simd_test(enable = "sse2")]
4453    unsafe fn test_mm_cmpnlt_sd() {
4454        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4455        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4456        let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
4457        assert_eq_m128i(r, e);
4458    }
4459
4460    #[simd_test(enable = "sse2")]
4461    unsafe fn test_mm_cmpnle_sd() {
4462        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4463        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4464        let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
4465        assert_eq_m128i(r, e);
4466    }
4467
4468    #[simd_test(enable = "sse2")]
4469    unsafe fn test_mm_cmpngt_sd() {
4470        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4471        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4472        let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
4473        assert_eq_m128i(r, e);
4474    }
4475
4476    #[simd_test(enable = "sse2")]
4477    unsafe fn test_mm_cmpnge_sd() {
4478        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4479        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
4480        let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
4481        assert_eq_m128i(r, e);
4482    }
4483
4484    #[simd_test(enable = "sse2")]
4485    unsafe fn test_mm_cmpeq_pd() {
4486        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
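        // The packed comparison produces an all-ones or all-zeros mask per lane.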
4487        let e = _mm_setr_epi64x(!0, 0);
4488        let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
4489        assert_eq_m128i(r, e);
4490    }
4491
4492    #[simd_test(enable = "sse2")]
4493    unsafe fn test_mm_cmplt_pd() {
4494        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4495        let e = _mm_setr_epi64x(0, !0);
4496        let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
4497        assert_eq_m128i(r, e);
4498    }
4499
4500    #[simd_test(enable = "sse2")]
4501    unsafe fn test_mm_cmple_pd() {
4502        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4503        let e = _mm_setr_epi64x(!0, !0);
4504        let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
4505        assert_eq_m128i(r, e);
4506    }
4507
4508    #[simd_test(enable = "sse2")]
4509    unsafe fn test_mm_cmpgt_pd() {
4510        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4511        let e = _mm_setr_epi64x(0, 0);
4512        let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
4513        assert_eq_m128i(r, e);
4514    }
4515
4516    #[simd_test(enable = "sse2")]
4517    unsafe fn test_mm_cmpge_pd() {
4518        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4519        let e = _mm_setr_epi64x(!0, 0);
4520        let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
4521        assert_eq_m128i(r, e);
4522    }
4523
4524    #[simd_test(enable = "sse2")]
4525    unsafe fn test_mm_cmpord_pd() {
4526        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4527        let e = _mm_setr_epi64x(0, !0);
4528        let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
4529        assert_eq_m128i(r, e);
4530    }
4531
4532    #[simd_test(enable = "sse2")]
4533    unsafe fn test_mm_cmpunord_pd() {
4534        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
4535        let e = _mm_setr_epi64x(!0, 0);
4536        let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
4537        assert_eq_m128i(r, e);
4538    }
4539
4540    #[simd_test(enable = "sse2")]
4541    unsafe fn test_mm_cmpneq_pd() {
4542        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4543        let e = _mm_setr_epi64x(!0, !0);
4544        let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
4545        assert_eq_m128i(r, e);
4546    }
4547
4548    #[simd_test(enable = "sse2")]
4549    unsafe fn test_mm_cmpnlt_pd() {
4550        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
4551        let e = _mm_setr_epi64x(0, 0);
4552        let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
4553        assert_eq_m128i(r, e);
4554    }
4555
4556    #[simd_test(enable = "sse2")]
4557    unsafe fn test_mm_cmpnle_pd() {
4558        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4559        let e = _mm_setr_epi64x(0, 0);
4560        let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
4561        assert_eq_m128i(r, e);
4562    }
4563
4564    #[simd_test(enable = "sse2")]
4565    unsafe fn test_mm_cmpngt_pd() {
4566        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
4567        let e = _mm_setr_epi64x(0, !0);
4568        let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
4569        assert_eq_m128i(r, e);
4570    }
4571
4572    #[simd_test(enable = "sse2")]
4573    unsafe fn test_mm_cmpnge_pd() {
4574        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4575        let e = _mm_setr_epi64x(0, !0);
4576        let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
4577        assert_eq_m128i(r, e);
4578    }
4579
4580    #[simd_test(enable = "sse2")]
4581    unsafe fn test_mm_comieq_sd() {
4582        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4583        assert!(_mm_comieq_sd(a, b) != 0);
4584
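        // A NaN operand makes the comparison unordered, which reports not-equal.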
4585        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
4586        assert!(_mm_comieq_sd(a, b) == 0);
4587    }
4588
4589    #[simd_test(enable = "sse2")]
4590    unsafe fn test_mm_comilt_sd() {
4591        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4592        assert!(_mm_comilt_sd(a, b) == 0);
4593    }
4594
4595    #[simd_test(enable = "sse2")]
4596    unsafe fn test_mm_comile_sd() {
4597        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4598        assert!(_mm_comile_sd(a, b) != 0);
4599    }
4600
4601    #[simd_test(enable = "sse2")]
4602    unsafe fn test_mm_comigt_sd() {
4603        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4604        assert!(_mm_comigt_sd(a, b) == 0);
4605    }
4606
4607    #[simd_test(enable = "sse2")]
4608    unsafe fn test_mm_comige_sd() {
4609        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4610        assert!(_mm_comige_sd(a, b) != 0);
4611    }
4612
4613    #[simd_test(enable = "sse2")]
4614    unsafe fn test_mm_comineq_sd() {
4615        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4616        assert!(_mm_comineq_sd(a, b) == 0);
4617    }
4618
4619    #[simd_test(enable = "sse2")]
4620    unsafe fn test_mm_ucomieq_sd() {
4621        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4622        assert!(_mm_ucomieq_sd(a, b) != 0);
4623
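        // ucomisd differs from comisd only in exception signaling (it raises
        // the invalid flag only for signaling NaNs); the result is the same.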
4624        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
4625        assert!(_mm_ucomieq_sd(a, b) == 0);
4626    }
4627
4628    #[simd_test(enable = "sse2")]
4629    unsafe fn test_mm_ucomilt_sd() {
4630        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4631        assert!(_mm_ucomilt_sd(a, b) == 0);
4632    }
4633
4634    #[simd_test(enable = "sse2")]
4635    unsafe fn test_mm_ucomile_sd() {
4636        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4637        assert!(_mm_ucomile_sd(a, b) != 0);
4638    }
4639
4640    #[simd_test(enable = "sse2")]
4641    unsafe fn test_mm_ucomigt_sd() {
4642        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4643        assert!(_mm_ucomigt_sd(a, b) == 0);
4644    }
4645
4646    #[simd_test(enable = "sse2")]
4647    unsafe fn test_mm_ucomige_sd() {
4648        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4649        assert!(_mm_ucomige_sd(a, b) != 0);
4650    }
4651
4652    #[simd_test(enable = "sse2")]
4653    unsafe fn test_mm_ucomineq_sd() {
4654        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
4655        assert!(_mm_ucomineq_sd(a, b) == 0);
4656    }
4657
4658    #[simd_test(enable = "sse2")]
4659    unsafe fn test_mm_movemask_pd() {
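        // The mask collects the sign bit of each lane; bit 0 is the low lane.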
4660        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
4661        assert_eq!(r, 0b01);
4662
4663        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
4664        assert_eq!(r, 0b11);
4665    }
4666
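    // 16-byte-aligned backing storage for the aligned load/store tests below.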
4667    #[repr(align(16))]
4668    struct Memory {
4669        data: [f64; 4],
4670    }
4671
4672    #[simd_test(enable = "sse2")]
4673    unsafe fn test_mm_load_pd() {
4674        let mem = Memory {
4675            data: [1.0f64, 2.0, 3.0, 4.0],
4676        };
4677        let vals = &mem.data;
4678        let d = vals.as_ptr();
4679
4680        let r = _mm_load_pd(d);
4681        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
4682    }
4683
4684    #[simd_test(enable = "sse2")]
4685    unsafe fn test_mm_load_sd() {
4686        let a = 1.;
4687        let expected = _mm_setr_pd(a, 0.);
4688        let r = _mm_load_sd(&a);
4689        assert_eq_m128d(r, expected);
4690    }
4691
4692    #[simd_test(enable = "sse2")]
4693    unsafe fn test_mm_loadh_pd() {
4694        let a = _mm_setr_pd(1., 2.);
4695        let b = 3.;
4696        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
4697        let r = _mm_loadh_pd(a, &b);
4698        assert_eq_m128d(r, expected);
4699    }
4700
4701    #[simd_test(enable = "sse2")]
4702    unsafe fn test_mm_loadl_pd() {
4703        let a = _mm_setr_pd(1., 2.);
4704        let b = 3.;
4705        let expected = _mm_setr_pd(3., get_m128d(a, 1));
4706        let r = _mm_loadl_pd(a, &b);
4707        assert_eq_m128d(r, expected);
4708    }
4709
4710    #[simd_test(enable = "sse2")]
4711    // Miri cannot support this until it is clear how it fits in the Rust memory model
4712    // (non-temporal store)
4713    #[cfg_attr(miri, ignore)]
4714    unsafe fn test_mm_stream_pd() {
4715        #[repr(align(128))]
4716        struct Memory {
4717            pub data: [f64; 2],
4718        }
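        // The non-temporal store bypasses the cache but still requires a
        // 16-byte-aligned destination.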
4719        let a = _mm_set1_pd(7.0);
4720        let mut mem = Memory { data: [-1.0; 2] };
4721
4722        _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4723        for i in 0..2 {
4724            assert_eq!(mem.data[i], get_m128d(a, i));
4725        }
4726    }
4727
4728    #[simd_test(enable = "sse2")]
4729    unsafe fn test_mm_store_sd() {
4730        let mut dest = 0.;
4731        let a = _mm_setr_pd(1., 2.);
4732        _mm_store_sd(&mut dest, a);
4733        assert_eq!(dest, _mm_cvtsd_f64(a));
4734    }
4735
4736    #[simd_test(enable = "sse2")]
4737    unsafe fn test_mm_store_pd() {
4738        let mut mem = Memory { data: [0.0f64; 4] };
4739        let vals = &mut mem.data;
4740        let a = _mm_setr_pd(1.0, 2.0);
4741        let d = vals.as_mut_ptr();
4742
4743        _mm_store_pd(d, *black_box(&a));
4744        assert_eq!(vals[0], 1.0);
4745        assert_eq!(vals[1], 2.0);
4746    }
4747
4748    #[simd_test(enable = "sse2")]
4749    unsafe fn test_mm_storeu_pd() {
4750        let mut mem = Memory { data: [0.0f64; 4] };
4751        let vals = &mut mem.data;
4752        let a = _mm_setr_pd(1.0, 2.0);
4753
4754        let mut ofs = 0;
4755        let mut p = vals.as_mut_ptr();
4756
4757        // Make sure p is **not** aligned to a 16-byte boundary
4758        if (p as usize) & 0xf == 0 {
4759            ofs = 1;
4760            p = p.add(1);
4761        }
4762
4763        _mm_storeu_pd(p, *black_box(&a));
4764
4765        if ofs > 0 {
4766            assert_eq!(vals[ofs - 1], 0.0);
4767        }
4768        assert_eq!(vals[ofs], 1.0);
4769        assert_eq!(vals[ofs + 1], 2.0);
4770    }
4771
4772    #[simd_test(enable = "sse2")]
4773    unsafe fn test_mm_storeu_si16() {
4774        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4775        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
4776        _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
4777        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
4778        assert_eq_m128i(r, e);
4779    }
4780
4781    #[simd_test(enable = "sse2")]
4782    unsafe fn test_mm_storeu_si32() {
4783        let a = _mm_setr_epi32(1, 2, 3, 4);
4784        let mut r = _mm_setr_epi32(5, 6, 7, 8);
4785        _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
4786        let e = _mm_setr_epi32(1, 6, 7, 8);
4787        assert_eq_m128i(r, e);
4788    }
4789
4790    #[simd_test(enable = "sse2")]
4791    unsafe fn test_mm_storeu_si64() {
4792        let a = _mm_setr_epi64x(1, 2);
4793        let mut r = _mm_setr_epi64x(3, 4);
4794        _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
4795        let e = _mm_setr_epi64x(1, 4);
4796        assert_eq_m128i(r, e);
4797    }
4798
4799    #[simd_test(enable = "sse2")]
4800    unsafe fn test_mm_store1_pd() {
4801        let mut mem = Memory { data: [0.0f64; 4] };
4802        let vals = &mut mem.data;
4803        let a = _mm_setr_pd(1.0, 2.0);
4804        let d = vals.as_mut_ptr();
4805
4806        _mm_store1_pd(d, *black_box(&a));
4807        assert_eq!(vals[0], 1.0);
4808        assert_eq!(vals[1], 1.0);
4809    }
4810
4811    #[simd_test(enable = "sse2")]
4812    unsafe fn test_mm_store_pd1() {
4813        let mut mem = Memory { data: [0.0f64; 4] };
4814        let vals = &mut mem.data;
4815        let a = _mm_setr_pd(1.0, 2.0);
4816        let d = vals.as_mut_ptr();
4817
4818        _mm_store_pd1(d, *black_box(&a));
4819        assert_eq!(vals[0], 1.0);
4820        assert_eq!(vals[1], 1.0);
4821    }
4822
4823    #[simd_test(enable = "sse2")]
4824    unsafe fn test_mm_storer_pd() {
4825        let mut mem = Memory { data: [0.0f64; 4] };
4826        let vals = &mut mem.data;
4827        let a = _mm_setr_pd(1.0, 2.0);
4828        let d = vals.as_mut_ptr();
4829
4830        _mm_storer_pd(d, *black_box(&a));
4831        assert_eq!(vals[0], 2.0);
4832        assert_eq!(vals[1], 1.0);
4833    }
4834
4835    #[simd_test(enable = "sse2")]
4836    unsafe fn test_mm_storeh_pd() {
4837        let mut dest = 0.;
4838        let a = _mm_setr_pd(1., 2.);
4839        _mm_storeh_pd(&mut dest, a);
4840        assert_eq!(dest, get_m128d(a, 1));
4841    }
4842
4843    #[simd_test(enable = "sse2")]
4844    unsafe fn test_mm_storel_pd() {
4845        let mut dest = 0.;
4846        let a = _mm_setr_pd(1., 2.);
4847        _mm_storel_pd(&mut dest, a);
4848        assert_eq!(dest, _mm_cvtsd_f64(a));
4849    }
4850
4851    #[simd_test(enable = "sse2")]
4852    unsafe fn test_mm_loadr_pd() {
4853        let mut mem = Memory {
4854            data: [1.0f64, 2.0, 3.0, 4.0],
4855        };
4856        let vals = &mut mem.data;
4857        let d = vals.as_ptr();
4858
4859        let r = _mm_loadr_pd(d);
4860        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
4861    }
4862
4863    #[simd_test(enable = "sse2")]
4864    unsafe fn test_mm_loadu_pd() {
4865        let mut mem = Memory {
4866            data: [1.0f64, 2.0, 3.0, 4.0],
4867        };
4868        let vals = &mut mem.data;
4869        let mut d = vals.as_ptr();
4870
4871        // make sure d is not aligned to a 16-byte boundary
4872        let mut offset = 0;
4873        if (d as usize) & 0xf == 0 {
4874            offset = 1;
4875            d = d.add(offset);
4876        }
4877
4878        let r = _mm_loadu_pd(d);
4879        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
4880        assert_eq_m128d(r, e);
4881    }
4882
4883    #[simd_test(enable = "sse2")]
4884    unsafe fn test_mm_loadu_si16() {
4885        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
4886        let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
4887        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
4888    }
4889
4890    #[simd_test(enable = "sse2")]
4891    unsafe fn test_mm_loadu_si32() {
4892        let a = _mm_setr_epi32(1, 2, 3, 4);
4893        let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
4894        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
4895    }
4896
4897    #[simd_test(enable = "sse2")]
4898    unsafe fn test_mm_loadu_si64() {
4899        let a = _mm_setr_epi64x(5, 6);
4900        let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
4901        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4902    }
4903
4904    #[simd_test(enable = "sse2")]
4905    unsafe fn test_mm_cvtpd_ps() {
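        // The two converted singles land in the low lanes; the upper two lanes
        // are zeroed. Values outside the f32 range overflow to infinities.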
4906        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
4907        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));
4908
4909        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
4910        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));
4911
4912        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
4913        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));
4914
4915        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
4916        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
4917    }
4918
4919    #[simd_test(enable = "sse2")]
4920    unsafe fn test_mm_cvtps_pd() {
4921        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
4922        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));
4923
4924        let r = _mm_cvtps_pd(_mm_setr_ps(
4925            f32::MAX,
4926            f32::INFINITY,
4927            f32::NEG_INFINITY,
4928            f32::MIN,
4929        ));
4930        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
4931    }
4932
4933    #[simd_test(enable = "sse2")]
4934    unsafe fn test_mm_cvtpd_epi32() {
4935        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
4936        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));
4937
4938        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
4939        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));
4940
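        // Inputs that are out of range, infinite, or NaN produce the "integer
        // indefinite" value, i32::MIN.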
4941        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
4942        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
4943
4944        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
4945        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
4946
4947        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
4948        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
4949    }
4950
4951    #[simd_test(enable = "sse2")]
4952    unsafe fn test_mm_cvtsd_si32() {
4953        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
4954        assert_eq!(r, -2);
4955
4956        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
4957        assert_eq!(r, i32::MIN);
4958
4959        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
4960        assert_eq!(r, i32::MIN);
4961    }
4962
4963    #[simd_test(enable = "sse2")]
4964    unsafe fn test_mm_cvtsd_ss() {
4965        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
4966        let b = _mm_setr_pd(2.0, -5.0);
4967
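        // Only the low lane of `a` is replaced by the converted double; the
        // other three lanes pass through.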
4968        let r = _mm_cvtsd_ss(a, b);
4969
4970        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));
4971
4972        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
4973        let b = _mm_setr_pd(f64::INFINITY, -5.0);
4974
4975        let r = _mm_cvtsd_ss(a, b);
4976
4977        assert_eq_m128(
4978            r,
4979            _mm_setr_ps(
4980                f32::INFINITY,
4981                f32::NEG_INFINITY,
4982                f32::MAX,
4983                f32::NEG_INFINITY,
4984            ),
4985        );
4986    }
4987
4988    #[simd_test(enable = "sse2")]
4989    unsafe fn test_mm_cvtsd_f64() {
4990        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
4991        assert_eq!(r, -1.1);
4992    }
4993
4994    #[simd_test(enable = "sse2")]
4995    unsafe fn test_mm_cvtss_sd() {
4996        let a = _mm_setr_pd(-1.1, 2.2);
4997        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
4998
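        // The low f32 of `b` is widened to f64 in the low lane; the high lane
        // comes from `a`.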
4999        let r = _mm_cvtss_sd(a, b);
5000        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));
5001
5002        let a = _mm_setr_pd(-1.1, f64::INFINITY);
5003        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);
5004
5005        let r = _mm_cvtss_sd(a, b);
5006        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
5007    }
5008
5009    #[simd_test(enable = "sse2")]
5010    unsafe fn test_mm_cvttpd_epi32() {
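        // The extra `t` (cvtt*) means the conversion truncates toward zero
        // instead of using the current rounding mode.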
5011        let a = _mm_setr_pd(-1.1, 2.2);
5012        let r = _mm_cvttpd_epi32(a);
5013        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));
5014
5015        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5016        let r = _mm_cvttpd_epi32(a);
5017        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
5018    }
5019
5020    #[simd_test(enable = "sse2")]
5021    unsafe fn test_mm_cvttsd_si32() {
5022        let a = _mm_setr_pd(-1.1, 2.2);
5023        let r = _mm_cvttsd_si32(a);
5024        assert_eq!(r, -1);
5025
5026        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
5027        let r = _mm_cvttsd_si32(a);
5028        assert_eq!(r, i32::MIN);
5029    }
5030
5031    #[simd_test(enable = "sse2")]
5032    unsafe fn test_mm_cvttps_epi32() {
5033        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
5034        let r = _mm_cvttps_epi32(a);
5035        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));
5036
5037        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
5038        let r = _mm_cvttps_epi32(a);
5039        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
5040    }
5041
5042    #[simd_test(enable = "sse2")]
5043    unsafe fn test_mm_set_sd() {
5044        let r = _mm_set_sd(-1.0_f64);
5045        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
5046    }
5047
5048    #[simd_test(enable = "sse2")]
5049    unsafe fn test_mm_set1_pd() {
5050        let r = _mm_set1_pd(-1.0_f64);
5051        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
5052    }
5053
5054    #[simd_test(enable = "sse2")]
5055    unsafe fn test_mm_set_pd1() {
5056        let r = _mm_set_pd1(-2.0_f64);
5057        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
5058    }
5059
5060    #[simd_test(enable = "sse2")]
5061    unsafe fn test_mm_set_pd() {
5062        let r = _mm_set_pd(1.0_f64, 5.0_f64);
5063        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
5064    }
5065
5066    #[simd_test(enable = "sse2")]
5067    unsafe fn test_mm_setr_pd() {
5068        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
5069        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
5070    }
5071
5072    #[simd_test(enable = "sse2")]
5073    unsafe fn test_mm_setzero_pd() {
5074        let r = _mm_setzero_pd();
5075        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
5076    }
5077
5078    #[simd_test(enable = "sse2")]
5079    unsafe fn test_mm_load1_pd() {
5080        let d = -5.0;
5081        let r = _mm_load1_pd(&d);
5082        assert_eq_m128d(r, _mm_setr_pd(d, d));
5083    }
5084
5085    #[simd_test(enable = "sse2")]
5086    unsafe fn test_mm_load_pd1() {
5087        let d = -5.0;
5088        let r = _mm_load_pd1(&d);
5089        assert_eq_m128d(r, _mm_setr_pd(d, d));
5090    }
5091
5092    #[simd_test(enable = "sse2")]
5093    unsafe fn test_mm_unpackhi_pd() {
5094        let a = _mm_setr_pd(1.0, 2.0);
5095        let b = _mm_setr_pd(3.0, 4.0);
5096        let r = _mm_unpackhi_pd(a, b);
5097        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
5098    }
5099
5100    #[simd_test(enable = "sse2")]
5101    unsafe fn test_mm_unpacklo_pd() {
5102        let a = _mm_setr_pd(1.0, 2.0);
5103        let b = _mm_setr_pd(3.0, 4.0);
5104        let r = _mm_unpacklo_pd(a, b);
5105        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
5106    }
5107
5108    #[simd_test(enable = "sse2")]
5109    unsafe fn test_mm_shuffle_pd() {
5110        let a = _mm_setr_pd(1., 2.);
5111        let b = _mm_setr_pd(3., 4.);
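        // Immediate bit 0 picks the low result lane from `a`; bit 1 picks the
        // high result lane from `b`. Zero selects (a[0], b[0]).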
5112        let expected = _mm_setr_pd(1., 3.);
5113        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
5114        assert_eq_m128d(r, expected);
5115    }
5116
5117    #[simd_test(enable = "sse2")]
5118    unsafe fn test_mm_move_sd() {
5119        let a = _mm_setr_pd(1., 2.);
5120        let b = _mm_setr_pd(3., 4.);
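        // The low lane comes from `b`, the high lane from `a`.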
5121        let expected = _mm_setr_pd(3., 2.);
5122        let r = _mm_move_sd(a, b);
5123        assert_eq_m128d(r, expected);
5124    }
5125
5126    #[simd_test(enable = "sse2")]
5127    unsafe fn test_mm_castpd_ps() {
5128        let a = _mm_set1_pd(0.);
5129        let expected = _mm_set1_ps(0.);
5130        let r = _mm_castpd_ps(a);
5131        assert_eq_m128(r, expected);
5132    }
5133
5134    #[simd_test(enable = "sse2")]
5135    unsafe fn test_mm_castpd_si128() {
5136        let a = _mm_set1_pd(0.);
5137        let expected = _mm_set1_epi64x(0);
5138        let r = _mm_castpd_si128(a);
5139        assert_eq_m128i(r, expected);
5140    }
5141
5142    #[simd_test(enable = "sse2")]
5143    unsafe fn test_mm_castps_pd() {
5144        let a = _mm_set1_ps(0.);
5145        let expected = _mm_set1_pd(0.);
5146        let r = _mm_castps_pd(a);
5147        assert_eq_m128d(r, expected);
5148    }
5149
5150    #[simd_test(enable = "sse2")]
5151    unsafe fn test_mm_castps_si128() {
5152        let a = _mm_set1_ps(0.);
5153        let expected = _mm_set1_epi32(0);
5154        let r = _mm_castps_si128(a);
5155        assert_eq_m128i(r, expected);
5156    }
5157
5158    #[simd_test(enable = "sse2")]
5159    unsafe fn test_mm_castsi128_pd() {
5160        let a = _mm_set1_epi64x(0);
5161        let expected = _mm_set1_pd(0.);
5162        let r = _mm_castsi128_pd(a);
5163        assert_eq_m128d(r, expected);
5164    }
5165
5166    #[simd_test(enable = "sse2")]
5167    unsafe fn test_mm_castsi128_ps() {
5168        let a = _mm_set1_epi32(0);
5169        let expected = _mm_set1_ps(0.);
5170        let r = _mm_castsi128_ps(a);
5171        assert_eq_m128(r, expected);
5172    }
5173}