core/stdarch/crates/core_arch/src/x86/ssse3.rs

//! Supplemental Streaming SIMD Extensions 3 (SSSE3)

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Computes the absolute value of packed 8-bit signed integers in `a` and
/// returns the unsigned results.
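///
/// A scalar sketch of the operation (an illustrative model, not the actual
/// implementation; the helper name is hypothetical). Note that `i8::MIN`
/// has no positive counterpart, so it wraps to itself, matching the
/// instruction's behavior:
///
/// ```
/// fn abs_epi8(a: [i8; 16]) -> [i8; 16] {
///     a.map(|x| x.wrapping_abs())
/// }
/// ```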
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pabsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_abs_epi8(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i8x16();
        let zero = i8x16::ZERO;
        let r = simd_select::<m8x16, _>(simd_lt(a, zero), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute value of each of the packed 16-bit signed integers
/// in `a` and returns the 16-bit unsigned integer results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pabsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_abs_epi16(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let zero = i16x8::ZERO;
        let r = simd_select::<m16x8, _>(simd_lt(a, zero), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute value of each of the packed 32-bit signed integers
/// in `a` and returns the 32-bit unsigned integer results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pabsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_abs_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i32x4();
        let zero = i32x4::ZERO;
        let r = simd_select::<m32x4, _>(simd_lt(a, zero), simd_neg(a), a);
        transmute(r)
    }
}

/// Shuffles bytes from `a` according to the content of `b`.
///
/// The low 4 bits of each byte of `b` are used as addresses
/// into the 16 bytes of `a`.
///
/// In addition, if the most significant bit of a byte of `b`
/// is set, the corresponding destination byte is set to 0.
///
/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is
/// logically equivalent to:
///
/// ```
/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
///     let mut r = [0u8; 16];
///     for i in 0..16 {
///         // if the most significant bit of b is set,
///         // then the destination byte is set to 0.
///         if b[i] & 0x80 == 0u8 {
///             r[i] = a[(b[i] % 16) as usize];
///         }
///     }
///     r
/// }
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pshufb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(pshufb128(a.as_u8x16(), b.as_u8x16())) }
}

/// Concatenates the 16-byte blocks of `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
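///
/// A scalar sketch of the operation (an illustrative model, not the actual
/// implementation; the helper name is hypothetical), with `a` in the high
/// half of the concatenation and `b` in the low half; shifts of more than
/// 32 bytes produce all zeroes:
///
/// ```
/// fn alignr(a: [u8; 16], b: [u8; 16], n: usize) -> [u8; 16] {
///     let mut concat = [0u8; 32];
///     concat[..16].copy_from_slice(&b);
///     concat[16..].copy_from_slice(&a);
///     let mut r = [0u8; 16];
///     for i in 0..16 {
///         // Bytes shifted in from beyond the concatenation are zero.
///         r[i] = if n + i < 32 { concat[n + i] } else { 0 };
///     }
///     r
/// }
/// ```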
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(palignr, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 > 32 {
        return _mm_setzero_si128();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm_setzero_si128(), a)
    } else {
        (a, b)
    };
    const fn mask(shift: u32, i: u32) -> u32 {
        if shift > 32 {
            // Unused, but needs to be a valid index.
            i
        } else if shift > 16 {
            shift - 16 + i
        } else {
            shift + i
        }
    }
    unsafe {
        let r: i8x16 = simd_shuffle!(
            b.as_i8x16(),
            a.as_i8x16(),
            [
                mask(IMM8 as u32, 0),
                mask(IMM8 as u32, 1),
                mask(IMM8 as u32, 2),
                mask(IMM8 as u32, 3),
                mask(IMM8 as u32, 4),
                mask(IMM8 as u32, 5),
                mask(IMM8 as u32, 6),
                mask(IMM8 as u32, 7),
                mask(IMM8 as u32, 8),
                mask(IMM8 as u32, 9),
                mask(IMM8 as u32, 10),
                mask(IMM8 as u32, 11),
                mask(IMM8 as u32, 12),
                mask(IMM8 as u32, 13),
                mask(IMM8 as u32, 14),
                mask(IMM8 as u32, 15),
            ],
        );
        transmute(r)
    }
}

/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of `[8 x i16]`.
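///
/// A scalar sketch of the operation (an illustrative model, not the actual
/// implementation; the helper name is hypothetical); the low half of the
/// result comes from `a`, the high half from `b`, and sums wrap on overflow:
///
/// ```
/// fn hadd_epi16(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
///     let mut r = [0i16; 8];
///     for i in 0..4 {
///         r[i] = a[2 * i].wrapping_add(a[2 * i + 1]);
///         r[i + 4] = b[2 * i].wrapping_add(b[2 * i + 1]);
///     }
///     r
/// }
/// ```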
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
    unsafe {
        let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
        let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
        simd_add(even, odd).as_m128i()
    }
}

/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
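///
/// A scalar sketch of the operation (an illustrative model, not the actual
/// implementation; the helper name is hypothetical); identical to
/// `_mm_hadd_epi16` except that the sums saturate instead of wrapping:
///
/// ```
/// fn hadds_epi16(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
///     let mut r = [0i16; 8];
///     for i in 0..4 {
///         r[i] = a[2 * i].saturating_add(a[2 * i + 1]);
///         r[i + 4] = b[2 * i].saturating_add(b[2 * i + 1]);
///     }
///     r
/// }
/// ```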
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) }
}

/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of `[4 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let b = b.as_i32x4();
    unsafe {
        let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
        let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
        simd_add(even, odd).as_m128i()
    }
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of `[8 x i16]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
    unsafe {
        let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]);
        let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]);
        simd_sub(even, odd).as_m128i()
    }
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
/// saturated to 8000h.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) }
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of `[4 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(phsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let b = b.as_i32x4();
    unsafe {
        let even: i32x4 = simd_shuffle!(a, b, [0, 2, 4, 6]);
        let odd: i32x4 = simd_shuffle!(a, b, [1, 3, 5, 7]);
        simd_sub(even, odd).as_m128i()
    }
}

/// Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination.
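///
/// A scalar sketch of the operation (an illustrative model, not the actual
/// implementation; the helper name is hypothetical); each 8-bit product
/// fits in an `i16`, so only the final pairwise addition can saturate:
///
/// ```
/// fn maddubs_epi16(a: [u8; 16], b: [i8; 16]) -> [i16; 8] {
///     let mut r = [0i16; 8];
///     for i in 0..8 {
///         let lo = a[2 * i] as i16 * b[2 * i] as i16;
///         let hi = a[2 * i + 1] as i16 * b[2 * i + 1] as i16;
///         r[i] = lo.saturating_add(hi);
///     }
///     r
/// }
/// ```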
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pmaddubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16())) }
}

/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// product to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits `[16:1]` to the destination.
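///
/// A scalar sketch of the per-element operation (an illustrative model, not
/// the actual implementation; the helper name is hypothetical):
///
/// ```
/// fn mulhrs_epi16(a: i16, b: i16) -> i16 {
///     // Widen, multiply, keep the top 18 bits, round, then take bits [16:1].
///     let tmp = ((a as i32 * b as i32) >> 14) + 1;
///     (tmp >> 1) as i16
/// }
/// ```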
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(pmulhrsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8())) }
}

/// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit
/// integer in `b` is negative, and returns the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
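///
/// A scalar sketch of the operation (an illustrative model, not the actual
/// implementation; the helper name is hypothetical); negation wraps, so
/// `i8::MIN` stays `i8::MIN`:
///
/// ```
/// fn sign_epi8(a: [i8; 16], b: [i8; 16]) -> [i8; 16] {
///     let mut r = [0i8; 16];
///     for i in 0..16 {
///         r[i] = match b[i] {
///             n if n < 0 => a[i].wrapping_neg(),
///             0 => 0,
///             _ => a[i],
///         };
///     }
///     r
/// }
/// ```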
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(psignb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(psignb128(a.as_i8x16(), b.as_i8x16())) }
}

/// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit
/// integer in `b` is negative, and returns the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(psignw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(psignw128(a.as_i16x8(), b.as_i16x8())) }
}

/// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit
/// integer in `b` is negative, and returns the results.
/// Elements in the result are zeroed out when the corresponding element in
/// `b` is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32)
#[inline]
#[target_feature(enable = "ssse3")]
#[cfg_attr(test, assert_instr(psignd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(psignd128(a.as_i32x4(), b.as_i32x4())) }
}

#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.ssse3.pshuf.b.128"]
    fn pshufb128(a: u8x16, b: u8x16) -> u8x16;

    #[link_name = "llvm.x86.ssse3.phadd.sw.128"]
    fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.phsub.sw.128"]
    fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
    fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;

    #[link_name = "llvm.x86.ssse3.pmul.hr.sw.128"]
    fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.psign.b.128"]
    fn psignb128(a: i8x16, b: i8x16) -> i8x16;

    #[link_name = "llvm.x86.ssse3.psign.w.128"]
    fn psignw128(a: i16x8, b: i16x8) -> i16x8;

    #[link_name = "llvm.x86.ssse3.psign.d.128"]
    fn psignd128(a: i32x4, b: i32x4) -> i32x4;
}

#[cfg(test)]
mod tests {
    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_abs_epi8() {
        let r = _mm_abs_epi8(_mm_set1_epi8(-5));
        assert_eq_m128i(r, _mm_set1_epi8(5));
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_abs_epi16() {
        let r = _mm_abs_epi16(_mm_set1_epi16(-5));
        assert_eq_m128i(r, _mm_set1_epi16(5));
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_abs_epi32() {
        let r = _mm_abs_epi32(_mm_set1_epi32(-5));
        assert_eq_m128i(r, _mm_set1_epi32(5));
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_shuffle_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            4, 128_u8 as i8, 4, 3,
            24, 12, 6, 19,
            12, 5, 5, 10,
            4, 1, 8, 0,
        );
        let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
        let r = _mm_shuffle_epi8(a, b);
        assert_eq_m128i(r, expected);

        // Test indices greater than 15 wrapping around
        let b = _mm_add_epi8(b, _mm_set1_epi8(32));
        let r = _mm_shuffle_epi8(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_alignr_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            4, 63, 4, 3,
            24, 12, 6, 19,
            12, 5, 5, 10,
            4, 1, 8, 0,
        );
        let r = _mm_alignr_epi8::<33>(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(0));

        let r = _mm_alignr_epi8::<17>(a, b);
        #[rustfmt::skip]
        let expected = _mm_setr_epi8(
            2, 3, 4, 5, 6, 7, 8, 9,
            10, 11, 12, 13, 14, 15, 16, 0,
        );
        assert_eq_m128i(r, expected);

        let r = _mm_alignr_epi8::<16>(a, b);
        assert_eq_m128i(r, a);

        let r = _mm_alignr_epi8::<15>(a, b);
        #[rustfmt::skip]
        let expected = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m128i(r, expected);

        let r = _mm_alignr_epi8::<0>(a, b);
        assert_eq_m128i(r, b);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hadd_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 36, 25);
        let r = _mm_hadd_epi16(a, b);
        assert_eq_m128i(r, expected);

        // Test wrapping on overflow
        let a = _mm_setr_epi16(i16::MAX, 1, i16::MAX, 2, i16::MAX, 3, i16::MAX, 4);
        let b = _mm_setr_epi16(i16::MIN, -1, i16::MIN, -2, i16::MIN, -3, i16::MIN, -4);
        let expected = _mm_setr_epi16(
            i16::MIN,
            i16::MIN + 1,
            i16::MIN + 2,
            i16::MIN + 3,
            i16::MAX,
            i16::MAX - 1,
            i16::MAX - 2,
            i16::MAX - 3,
        );
        let r = _mm_hadd_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hadds_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, 1, -32768, -1);
        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 32767, -32768);
        let r = _mm_hadds_epi16(a, b);
        assert_eq_m128i(r, expected);

        // Test saturating on overflow
        let a = _mm_setr_epi16(i16::MAX, 1, i16::MAX, 2, i16::MAX, 3, i16::MAX, 4);
        let b = _mm_setr_epi16(i16::MIN, -1, i16::MIN, -2, i16::MIN, -3, i16::MIN, -4);
        let expected = _mm_setr_epi16(
            i16::MAX,
            i16::MAX,
            i16::MAX,
            i16::MAX,
            i16::MIN,
            i16::MIN,
            i16::MIN,
            i16::MIN,
        );
        let r = _mm_hadds_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hadd_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(4, 128, 4, 3);
        let expected = _mm_setr_epi32(3, 7, 132, 7);
        let r = _mm_hadd_epi32(a, b);
        assert_eq_m128i(r, expected);

        // Test wrapping on overflow
        let a = _mm_setr_epi32(i32::MAX, 1, i32::MAX, 2);
        let b = _mm_setr_epi32(i32::MIN, -1, i32::MIN, -2);
        let expected = _mm_setr_epi32(i32::MIN, i32::MIN + 1, i32::MAX, i32::MAX - 1);
        let r = _mm_hadd_epi32(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hsub_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 12, -13);
        let r = _mm_hsub_epi16(a, b);
        assert_eq_m128i(r, expected);

        // Test wrapping on overflow
        let a = _mm_setr_epi16(i16::MAX, -1, i16::MAX, -2, i16::MAX, -3, i16::MAX, -4);
        let b = _mm_setr_epi16(i16::MIN, 1, i16::MIN, 2, i16::MIN, 3, i16::MIN, 4);
        let expected = _mm_setr_epi16(
            i16::MIN,
            i16::MIN + 1,
            i16::MIN + 2,
            i16::MIN + 3,
            i16::MAX,
            i16::MAX - 1,
            i16::MAX - 2,
            i16::MAX - 3,
        );
        let r = _mm_hsub_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hsubs_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 32767, -32768);
        let r = _mm_hsubs_epi16(a, b);
        assert_eq_m128i(r, expected);

        // Test saturating on overflow
        let a = _mm_setr_epi16(i16::MAX, -1, i16::MAX, -2, i16::MAX, -3, i16::MAX, -4);
        let b = _mm_setr_epi16(i16::MIN, 1, i16::MIN, 2, i16::MIN, 3, i16::MIN, 4);
        let expected = _mm_setr_epi16(
            i16::MAX,
            i16::MAX,
            i16::MAX,
            i16::MAX,
            i16::MIN,
            i16::MIN,
            i16::MIN,
            i16::MIN,
        );
        let r = _mm_hsubs_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_hsub_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(4, 128, 4, 3);
        let expected = _mm_setr_epi32(-1, -1, -124, 1);
        let r = _mm_hsub_epi32(a, b);
        assert_eq_m128i(r, expected);

        // Test wrapping on overflow
        let a = _mm_setr_epi32(i32::MAX, -1, i32::MAX, -2);
        let b = _mm_setr_epi32(i32::MIN, 1, i32::MIN, 2);
        let expected = _mm_setr_epi32(i32::MIN, i32::MIN + 1, i32::MAX, i32::MAX - 1);
        let r = _mm_hsub_epi32(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_maddubs_epi16() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            4, 63, 4, 3,
            24, 12, 6, 19,
            12, 5, 5, 10,
            4, 1, 8, 0,
        );
        let expected = _mm_setr_epi16(130, 24, 192, 194, 158, 175, 66, 120);
        let r = _mm_maddubs_epi16(a, b);
        assert_eq_m128i(r, expected);

        // Test widening and saturation
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            u8::MAX as i8, u8::MAX as i8,
            u8::MAX as i8, u8::MAX as i8,
            u8::MAX as i8, u8::MAX as i8,
            100, 100, 0, 0,
            0, 0, 0, 0, 0, 0,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            i8::MAX, i8::MAX,
            i8::MAX, i8::MIN,
            i8::MIN, i8::MIN,
            50, 15, 0, 0, 0,
            0, 0, 0, 0, 0,
        );
        let expected = _mm_setr_epi16(i16::MAX, -255, i16::MIN, 6500, 0, 0, 0, 0);
        let r = _mm_maddubs_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_mulhrs_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
        let expected = _mm_setr_epi16(0, 0, 0, 0, 5, 0, -7, 0);
        let r = _mm_mulhrs_epi16(a, b);
        assert_eq_m128i(r, expected);

        // Test extreme values
        let a = _mm_setr_epi16(i16::MAX, i16::MIN, i16::MIN, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(i16::MAX, i16::MIN, i16::MAX, 0, 0, 0, 0, 0);
        let expected = _mm_setr_epi16(i16::MAX - 1, i16::MIN, -i16::MAX, 0, 0, 0, 0, 0);
        let r = _mm_mulhrs_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_sign_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, -14, -15, 16,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            4, 63, -4, 3, 24, 12, -6, -19,
            12, 5, -5, 10, 4, 1, -8, 0,
        );
        #[rustfmt::skip]
        let expected = _mm_setr_epi8(
            1, 2, -3, 4, 5, 6, -7, -8,
            9, 10, -11, 12, 13, -14, 15, 0,
        );
        let r = _mm_sign_epi8(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_sign_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, -5, -6, 7, 8);
        let b = _mm_setr_epi16(4, 128, 0, 3, 1, -1, -2, 1);
        let expected = _mm_setr_epi16(1, 2, 0, 4, -5, 6, -7, 8);
        let r = _mm_sign_epi16(a, b);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "ssse3")]
    unsafe fn test_mm_sign_epi32() {
        let a = _mm_setr_epi32(-1, 2, 3, 4);
        let b = _mm_setr_epi32(1, -1, 1, 0);
        let expected = _mm_setr_epi32(-1, -2, 3, 0);
        let r = _mm_sign_epi32(a, b);
        assert_eq_m128i(r, expected);
    }
}