Skip to main content

miri/shims/x86/
mod.rs

1use rustc_abi::{CanonAbi, FieldIdx, Size};
2use rustc_apfloat::Float;
3use rustc_apfloat::ieee::Single;
4use rustc_middle::ty::Ty;
5use rustc_middle::{mir, ty};
6use rustc_span::Symbol;
7use rustc_target::callconv::FnAbi;
8use rustc_target::spec::Arch;
9
10use self::helpers::bool_to_simd_element;
11use crate::*;
12
13mod aesni;
14mod avx;
15mod avx2;
16mod avx512;
17mod bmi;
18mod gfni;
19mod sha;
20mod sse;
21mod sse2;
22mod sse3;
23mod sse41;
24mod sse42;
25mod ssse3;
26
27impl<'tcx> EvalContextExt<'tcx> for crate::MiriInterpCx<'tcx> {}
28pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
29    fn emulate_x86_intrinsic(
30        &mut self,
31        link_name: Symbol,
32        abi: &FnAbi<'tcx, Ty<'tcx>>,
33        args: &[OpTy<'tcx>],
34        dest: &MPlaceTy<'tcx>,
35    ) -> InterpResult<'tcx, EmulateItemResult> {
36        let this = self.eval_context_mut();
37        // Prefix should have already been checked.
38        let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.").unwrap();
39        match unprefixed_name {
40            // Used to implement the `_addcarry_u{32, 64}` and the `_subborrow_u{32, 64}` functions.
41            // Computes a + b or a - b with input and output carry/borrow. The input carry/borrow is an 8-bit
42            // value, which is interpreted as 1 if it is non-zero. The output carry/borrow is an 8-bit value that will be 0 or 1.
43            // https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-8/addcarry-u32-addcarry-u64.html
44            // https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-8/subborrow-u32-subborrow-u64.html
45            "addcarry.32" | "addcarry.64" | "subborrow.32" | "subborrow.64" => {
46                if unprefixed_name.ends_with("64") && this.tcx.sess.target.arch != Arch::X86_64 {
47                    return interp_ok(EmulateItemResult::NotSupported);
48                }
49
50                let [cb_in, a, b] =
51                    this.check_shim_sig_lenient(abi, CanonAbi::C, link_name, args)?;
52                let op = if unprefixed_name.starts_with("add") {
53                    mir::BinOp::AddWithOverflow
54                } else {
55                    mir::BinOp::SubWithOverflow
56                };
57
58                let (sum, cb_out) = carrying_add(this, cb_in, a, b, op)?;
59                this.write_scalar(cb_out, &this.project_field(dest, FieldIdx::ZERO)?)?;
60                this.write_immediate(*sum, &this.project_field(dest, FieldIdx::ONE)?)?;
61            }
62
63            // Used to implement the `_mm_pause` function.
64            // The intrinsic is used to hint the processor that the code is in a spin-loop.
65            // It is compiled down to a `pause` instruction. When SSE2 is not available,
66            // the instruction behaves like a no-op, so it is always safe to call the
67            // intrinsic.
68            "sse2.pause" => {
69                let [] = this.check_shim_sig_lenient(abi, CanonAbi::C, link_name, args)?;
70                // Only exhibit the spin-loop hint behavior when SSE2 is enabled.
71                if this.tcx.sess.unstable_target_features.contains(&Symbol::intern("sse2")) {
72                    this.yield_active_thread();
73                }
74            }
75
76            "pclmulqdq" | "pclmulqdq.256" | "pclmulqdq.512" => {
77                let mut len = 2; // in units of 64bits
78                this.expect_target_feature_for_intrinsic(link_name, "pclmulqdq")?;
79                if unprefixed_name.ends_with(".256") {
80                    this.expect_target_feature_for_intrinsic(link_name, "vpclmulqdq")?;
81                    len = 4;
82                } else if unprefixed_name.ends_with(".512") {
83                    this.expect_target_feature_for_intrinsic(link_name, "vpclmulqdq")?;
84                    this.expect_target_feature_for_intrinsic(link_name, "avx512f")?;
85                    len = 8;
86                }
87
88                let [left, right, imm] =
89                    this.check_shim_sig_lenient(abi, CanonAbi::C, link_name, args)?;
90
91                pclmulqdq(this, left, right, imm, dest, len)?;
92            }
93
94            name if name.starts_with("bmi.") => {
95                return bmi::EvalContextExt::emulate_x86_bmi_intrinsic(
96                    this, link_name, abi, args, dest,
97                );
98            }
99            // The GFNI extension does not get its own namespace.
100            // Check for instruction names instead.
101            name if name.starts_with("vgf2p8affine") || name.starts_with("vgf2p8mulb") => {
102                return gfni::EvalContextExt::emulate_x86_gfni_intrinsic(
103                    this, link_name, abi, args, dest,
104                );
105            }
106            name if name.starts_with("sha") => {
107                return sha::EvalContextExt::emulate_x86_sha_intrinsic(
108                    this, link_name, abi, args, dest,
109                );
110            }
111            name if name.starts_with("sse.") => {
112                return sse::EvalContextExt::emulate_x86_sse_intrinsic(
113                    this, link_name, abi, args, dest,
114                );
115            }
116            name if name.starts_with("sse2.") => {
117                return sse2::EvalContextExt::emulate_x86_sse2_intrinsic(
118                    this, link_name, abi, args, dest,
119                );
120            }
121            name if name.starts_with("sse3.") => {
122                return sse3::EvalContextExt::emulate_x86_sse3_intrinsic(
123                    this, link_name, abi, args, dest,
124                );
125            }
126            name if name.starts_with("ssse3.") => {
127                return ssse3::EvalContextExt::emulate_x86_ssse3_intrinsic(
128                    this, link_name, abi, args, dest,
129                );
130            }
131            name if name.starts_with("sse41.") => {
132                return sse41::EvalContextExt::emulate_x86_sse41_intrinsic(
133                    this, link_name, abi, args, dest,
134                );
135            }
136            name if name.starts_with("sse42.") => {
137                return sse42::EvalContextExt::emulate_x86_sse42_intrinsic(
138                    this, link_name, abi, args, dest,
139                );
140            }
141            name if name.starts_with("aesni.") => {
142                return aesni::EvalContextExt::emulate_x86_aesni_intrinsic(
143                    this, link_name, abi, args, dest,
144                );
145            }
146            name if name.starts_with("avx.") => {
147                return avx::EvalContextExt::emulate_x86_avx_intrinsic(
148                    this, link_name, abi, args, dest,
149                );
150            }
151            name if name.starts_with("avx2.") => {
152                return avx2::EvalContextExt::emulate_x86_avx2_intrinsic(
153                    this, link_name, abi, args, dest,
154                );
155            }
156            name if name.starts_with("avx512.") => {
157                return avx512::EvalContextExt::emulate_x86_avx512_intrinsic(
158                    this, link_name, abi, args, dest,
159                );
160            }
161
162            _ => return interp_ok(EmulateItemResult::NotSupported),
163        }
164        interp_ok(EmulateItemResult::NeedsReturn)
165    }
166}
167
#[derive(Copy, Clone)]
enum FloatBinOp {
    /// Comparison
    ///
    /// The semantics of this operator is a case distinction: we compare the two operands,
    /// and then we return one of the four booleans `gt`, `lt`, `eq`, `unord` depending on
    /// which class they fall into.
    ///
    /// AVX supports all 16 combinations, SSE only a subset
    ///
    /// <https://www.felixcloutier.com/x86/cmpss>
    /// <https://www.felixcloutier.com/x86/cmpps>
    /// <https://www.felixcloutier.com/x86/cmpsd>
    /// <https://www.felixcloutier.com/x86/cmppd>
    Cmp {
        /// Result when lhs > rhs
        gt: bool,
        /// Result when lhs < rhs
        lt: bool,
        /// Result when lhs == rhs
        eq: bool,
        /// Result when lhs is NaN or rhs is NaN
        unord: bool,
    },
    /// Minimum value (with SSE semantics)
    ///
    /// <https://www.felixcloutier.com/x86/minss>
    /// <https://www.felixcloutier.com/x86/minps>
    /// <https://www.felixcloutier.com/x86/minsd>
    /// <https://www.felixcloutier.com/x86/minpd>
    Min,
    /// Maximum value (with SSE semantics)
    ///
    /// <https://www.felixcloutier.com/x86/maxss>
    /// <https://www.felixcloutier.com/x86/maxps>
    /// <https://www.felixcloutier.com/x86/maxsd>
    /// <https://www.felixcloutier.com/x86/maxpd>
    Max,
}
207
208impl FloatBinOp {
209    /// Convert from the `imm` argument used to specify the comparison
210    /// operation in intrinsics such as `llvm.x86.sse.cmp.ss`.
211    fn cmp_from_imm<'tcx>(
212        ecx: &crate::MiriInterpCx<'tcx>,
213        imm: i8,
214        intrinsic: Symbol,
215    ) -> InterpResult<'tcx, Self> {
216        // Only bits 0..=4 are used, remaining should be zero.
217        if imm & !0b1_1111 != 0 {
218            panic!("invalid `imm` parameter of {intrinsic}: 0x{imm:x}");
219        }
220        // Bit 4 specifies whether the operation is quiet or signaling, which
221        // we do not care in Miri.
222        // Bits 0..=2 specifies the operation.
223        // `gt` indicates the result to be returned when the LHS is strictly
224        // greater than the RHS, and so on.
225        let (gt, lt, eq, mut unord) = match imm & 0b111 {
226            // Equal
227            0x0 => (false, false, true, false),
228            // Less-than
229            0x1 => (false, true, false, false),
230            // Less-or-equal
231            0x2 => (false, true, true, false),
232            // Unordered (either is NaN)
233            0x3 => (false, false, false, true),
234            // Not equal
235            0x4 => (true, true, false, true),
236            // Not less-than
237            0x5 => (true, false, true, true),
238            // Not less-or-equal
239            0x6 => (true, false, false, true),
240            // Ordered (neither is NaN)
241            0x7 => (true, true, true, false),
242            _ => unreachable!(),
243        };
244        // When bit 3 is 1 (only possible in AVX), unord is toggled.
245        if imm & 0b1000 != 0 {
246            ecx.expect_target_feature_for_intrinsic(intrinsic, "avx")?;
247            unord = !unord;
248        }
249        interp_ok(Self::Cmp { gt, lt, eq, unord })
250    }
251}
252
253/// Performs `which` scalar operation on `left` and `right` and returns
254/// the result.
255fn bin_op_float<'tcx, F: rustc_apfloat::Float>(
256    which: FloatBinOp,
257    left: &ImmTy<'tcx>,
258    right: &ImmTy<'tcx>,
259) -> InterpResult<'tcx, Scalar> {
260    match which {
261        FloatBinOp::Cmp { gt, lt, eq, unord } => {
262            let left = left.to_scalar().to_float::<F>()?;
263            let right = right.to_scalar().to_float::<F>()?;
264
265            let res = match left.partial_cmp(&right) {
266                None => unord,
267                Some(std::cmp::Ordering::Less) => lt,
268                Some(std::cmp::Ordering::Equal) => eq,
269                Some(std::cmp::Ordering::Greater) => gt,
270            };
271            interp_ok(bool_to_simd_element(res, Size::from_bits(F::BITS)))
272        }
273        FloatBinOp::Min => {
274            let left_scalar = left.to_scalar();
275            let left = left_scalar.to_float::<F>()?;
276            let right_scalar = right.to_scalar();
277            let right = right_scalar.to_float::<F>()?;
278            // SSE semantics to handle zero and NaN. Note that `x == F::ZERO`
279            // is true when `x` is either +0 or -0.
280            if (left == F::ZERO && right == F::ZERO)
281                || left.is_nan()
282                || right.is_nan()
283                || left >= right
284            {
285                interp_ok(right_scalar)
286            } else {
287                interp_ok(left_scalar)
288            }
289        }
290        FloatBinOp::Max => {
291            let left_scalar = left.to_scalar();
292            let left = left_scalar.to_float::<F>()?;
293            let right_scalar = right.to_scalar();
294            let right = right_scalar.to_float::<F>()?;
295            // SSE semantics to handle zero and NaN. Note that `x == F::ZERO`
296            // is true when `x` is either +0 or -0.
297            if (left == F::ZERO && right == F::ZERO)
298                || left.is_nan()
299                || right.is_nan()
300                || left <= right
301            {
302                interp_ok(right_scalar)
303            } else {
304                interp_ok(left_scalar)
305            }
306        }
307    }
308}
309
310/// Performs `which` operation on the first component of `left` and `right`
311/// and copies the other components from `left`. The result is stored in `dest`.
312fn bin_op_simd_float_first<'tcx, F: rustc_apfloat::Float>(
313    ecx: &mut crate::MiriInterpCx<'tcx>,
314    which: FloatBinOp,
315    left: &OpTy<'tcx>,
316    right: &OpTy<'tcx>,
317    dest: &MPlaceTy<'tcx>,
318) -> InterpResult<'tcx, ()> {
319    let (left, left_len) = ecx.project_to_simd(left)?;
320    let (right, right_len) = ecx.project_to_simd(right)?;
321    let (dest, dest_len) = ecx.project_to_simd(dest)?;
322
323    assert_eq!(dest_len, left_len);
324    assert_eq!(dest_len, right_len);
325
326    let res0 = bin_op_float::<F>(
327        which,
328        &ecx.read_immediate(&ecx.project_index(&left, 0)?)?,
329        &ecx.read_immediate(&ecx.project_index(&right, 0)?)?,
330    )?;
331    ecx.write_scalar(res0, &ecx.project_index(&dest, 0)?)?;
332
333    for i in 1..dest_len {
334        ecx.copy_op(&ecx.project_index(&left, i)?, &ecx.project_index(&dest, i)?)?;
335    }
336
337    interp_ok(())
338}
339
340/// Performs `which` operation on each component of `left` and
341/// `right`, storing the result is stored in `dest`.
342fn bin_op_simd_float_all<'tcx, F: rustc_apfloat::Float>(
343    ecx: &mut crate::MiriInterpCx<'tcx>,
344    which: FloatBinOp,
345    left: &OpTy<'tcx>,
346    right: &OpTy<'tcx>,
347    dest: &MPlaceTy<'tcx>,
348) -> InterpResult<'tcx, ()> {
349    let (left, left_len) = ecx.project_to_simd(left)?;
350    let (right, right_len) = ecx.project_to_simd(right)?;
351    let (dest, dest_len) = ecx.project_to_simd(dest)?;
352
353    assert_eq!(dest_len, left_len);
354    assert_eq!(dest_len, right_len);
355
356    for i in 0..dest_len {
357        let left = ecx.read_immediate(&ecx.project_index(&left, i)?)?;
358        let right = ecx.read_immediate(&ecx.project_index(&right, i)?)?;
359        let dest = ecx.project_index(&dest, i)?;
360
361        let res = bin_op_float::<F>(which, &left, &right)?;
362        ecx.write_scalar(res, &dest)?;
363    }
364
365    interp_ok(())
366}
367
#[derive(Copy, Clone)]
enum FloatUnaryOp {
    /// Approximate reciprocal, i.e. roughly `1 / x`
    ///
    /// <https://www.felixcloutier.com/x86/rcpss>
    /// <https://www.felixcloutier.com/x86/rcpps>
    Rcp,
    /// Approximate reciprocal square root, i.e. roughly `1 / sqrt(x)`
    ///
    /// <https://www.felixcloutier.com/x86/rsqrtss>
    /// <https://www.felixcloutier.com/x86/rsqrtps>
    Rsqrt,
}
381
382/// Performs `which` scalar operation on `op` and returns the result.
383fn unary_op_f32<'tcx>(
384    ecx: &mut crate::MiriInterpCx<'tcx>,
385    which: FloatUnaryOp,
386    op: &ImmTy<'tcx>,
387) -> InterpResult<'tcx, Scalar> {
388    match which {
389        FloatUnaryOp::Rcp => {
390            let op = op.to_scalar().to_f32()?;
391            let div = (Single::from_u128(1).value / op).value;
392            // Apply a relative error with a magnitude on the order of 2^-12 to simulate the
393            // inaccuracy of RCP.
394            let res = math::apply_random_float_error(ecx, div, -12);
395            interp_ok(Scalar::from_f32(res))
396        }
397        FloatUnaryOp::Rsqrt => {
398            let op = op.to_scalar().to_f32()?;
399            let rsqrt = (Single::from_u128(1).value / math::sqrt(op)).value;
400            // Apply a relative error with a magnitude on the order of 2^-12 to simulate the
401            // inaccuracy of RSQRT.
402            let res = math::apply_random_float_error(ecx, rsqrt, -12);
403            interp_ok(Scalar::from_f32(res))
404        }
405    }
406}
407
408/// Performs `which` operation on the first component of `op` and copies
409/// the other components. The result is stored in `dest`.
410fn unary_op_ss<'tcx>(
411    ecx: &mut crate::MiriInterpCx<'tcx>,
412    which: FloatUnaryOp,
413    op: &OpTy<'tcx>,
414    dest: &MPlaceTy<'tcx>,
415) -> InterpResult<'tcx, ()> {
416    let (op, op_len) = ecx.project_to_simd(op)?;
417    let (dest, dest_len) = ecx.project_to_simd(dest)?;
418
419    assert_eq!(dest_len, op_len);
420
421    let res0 = unary_op_f32(ecx, which, &ecx.read_immediate(&ecx.project_index(&op, 0)?)?)?;
422    ecx.write_scalar(res0, &ecx.project_index(&dest, 0)?)?;
423
424    for i in 1..dest_len {
425        ecx.copy_op(&ecx.project_index(&op, i)?, &ecx.project_index(&dest, i)?)?;
426    }
427
428    interp_ok(())
429}
430
431/// Performs `which` operation on each component of `op`, storing the
432/// result is stored in `dest`.
433fn unary_op_ps<'tcx>(
434    ecx: &mut crate::MiriInterpCx<'tcx>,
435    which: FloatUnaryOp,
436    op: &OpTy<'tcx>,
437    dest: &MPlaceTy<'tcx>,
438) -> InterpResult<'tcx, ()> {
439    let (op, op_len) = ecx.project_to_simd(op)?;
440    let (dest, dest_len) = ecx.project_to_simd(dest)?;
441
442    assert_eq!(dest_len, op_len);
443
444    for i in 0..dest_len {
445        let op = ecx.read_immediate(&ecx.project_index(&op, i)?)?;
446        let dest = ecx.project_index(&dest, i)?;
447
448        let res = unary_op_f32(ecx, which, &op)?;
449        ecx.write_scalar(res, &dest)?;
450    }
451
452    interp_ok(())
453}
454
enum ShiftOp {
    /// Left shift; zeros are shifted in, so the logical and arithmetic variants coincide
    Left,
    /// Logical right shift: zeros are shifted in
    RightLogic,
    /// Arithmetic right shift: copies of the sign bit are shifted in
    RightArith,
}
463
464/// Shifts each element of `left` by a scalar amount. The shift amount
465/// is determined by the lowest 64 bits of `right` (which is a 128-bit vector).
466///
467/// For logic shifts, when right is larger than BITS - 1, zero is produced.
468/// For arithmetic right-shifts, when right is larger than BITS - 1, the sign
469/// bit is copied to all bits.
470fn shift_simd_by_scalar<'tcx>(
471    ecx: &mut crate::MiriInterpCx<'tcx>,
472    left: &OpTy<'tcx>,
473    right: &OpTy<'tcx>,
474    which: ShiftOp,
475    dest: &MPlaceTy<'tcx>,
476) -> InterpResult<'tcx, ()> {
477    let (left, left_len) = ecx.project_to_simd(left)?;
478    let (dest, dest_len) = ecx.project_to_simd(dest)?;
479
480    assert_eq!(dest_len, left_len);
481    // `right` may have a different length, and we only care about its
482    // lowest 64bit anyway.
483
484    // Get the 64-bit shift operand and convert it to the type expected
485    // by checked_{shl,shr} (u32).
486    // It is ok to saturate the value to u32::MAX because any value
487    // above BITS - 1 will produce the same result.
488    let shift = u32::try_from(extract_first_u64(ecx, right)?).unwrap_or(u32::MAX);
489
490    for i in 0..dest_len {
491        let left = ecx.read_scalar(&ecx.project_index(&left, i)?)?;
492        let dest = ecx.project_index(&dest, i)?;
493
494        let res = match which {
495            ShiftOp::Left => {
496                let left = left.to_uint(dest.layout.size)?;
497                let res = left.checked_shl(shift).unwrap_or(0);
498                // `truncate` is needed as left-shift can make the absolute value larger.
499                Scalar::from_uint(dest.layout.size.truncate(res), dest.layout.size)
500            }
501            ShiftOp::RightLogic => {
502                let left = left.to_uint(dest.layout.size)?;
503                let res = left.checked_shr(shift).unwrap_or(0);
504                // No `truncate` needed as right-shift can only make the absolute value smaller.
505                Scalar::from_uint(res, dest.layout.size)
506            }
507            ShiftOp::RightArith => {
508                let left = left.to_int(dest.layout.size)?;
509                // On overflow, copy the sign bit to the remaining bits
510                let res = left.checked_shr(shift).unwrap_or(left >> 127);
511                // No `truncate` needed as right-shift can only make the absolute value smaller.
512                Scalar::from_int(res, dest.layout.size)
513            }
514        };
515        ecx.write_scalar(res, &dest)?;
516    }
517
518    interp_ok(())
519}
520
521/// Takes a 128-bit vector, transmutes it to `[u64; 2]` and extracts
522/// the first value.
523fn extract_first_u64<'tcx>(
524    ecx: &crate::MiriInterpCx<'tcx>,
525    op: &OpTy<'tcx>,
526) -> InterpResult<'tcx, u64> {
527    // Transmute vector to `[u64; 2]`
528    let array_layout = ecx.layout_of(Ty::new_array(ecx.tcx.tcx, ecx.tcx.types.u64, 2))?;
529    let op = op.transmute(array_layout, ecx)?;
530
531    // Get the first u64 from the array
532    ecx.read_scalar(&ecx.project_index(&op, 0)?)?.to_u64()
533}
534
535// Rounds the first element of `right` according to `rounding`
536// and copies the remaining elements from `left`.
537fn round_first<'tcx, F: rustc_apfloat::Float>(
538    ecx: &mut crate::MiriInterpCx<'tcx>,
539    left: &OpTy<'tcx>,
540    right: &OpTy<'tcx>,
541    rounding: &OpTy<'tcx>,
542    dest: &MPlaceTy<'tcx>,
543) -> InterpResult<'tcx, ()> {
544    let (left, left_len) = ecx.project_to_simd(left)?;
545    let (right, right_len) = ecx.project_to_simd(right)?;
546    let (dest, dest_len) = ecx.project_to_simd(dest)?;
547
548    assert_eq!(dest_len, left_len);
549    assert_eq!(dest_len, right_len);
550
551    let rounding = rounding_from_imm(ecx.read_scalar(rounding)?.to_i32()?)?;
552
553    let op0: F = ecx.read_scalar(&ecx.project_index(&right, 0)?)?.to_float()?;
554    let res = op0.round_to_integral(rounding).value;
555    ecx.write_scalar(
556        Scalar::from_uint(res.to_bits(), Size::from_bits(F::BITS)),
557        &ecx.project_index(&dest, 0)?,
558    )?;
559
560    for i in 1..dest_len {
561        ecx.copy_op(&ecx.project_index(&left, i)?, &ecx.project_index(&dest, i)?)?;
562    }
563
564    interp_ok(())
565}
566
567// Rounds all elements of `op` according to `rounding`.
568fn round_all<'tcx, F: rustc_apfloat::Float>(
569    ecx: &mut crate::MiriInterpCx<'tcx>,
570    op: &OpTy<'tcx>,
571    rounding: &OpTy<'tcx>,
572    dest: &MPlaceTy<'tcx>,
573) -> InterpResult<'tcx, ()> {
574    let (op, op_len) = ecx.project_to_simd(op)?;
575    let (dest, dest_len) = ecx.project_to_simd(dest)?;
576
577    assert_eq!(dest_len, op_len);
578
579    let rounding = rounding_from_imm(ecx.read_scalar(rounding)?.to_i32()?)?;
580
581    for i in 0..dest_len {
582        let op: F = ecx.read_scalar(&ecx.project_index(&op, i)?)?.to_float()?;
583        let res = op.round_to_integral(rounding).value;
584        ecx.write_scalar(
585            Scalar::from_uint(res.to_bits(), Size::from_bits(F::BITS)),
586            &ecx.project_index(&dest, i)?,
587        )?;
588    }
589
590    interp_ok(())
591}
592
593/// Gets equivalent `rustc_apfloat::Round` from rounding mode immediate of
594/// `round.{ss,sd,ps,pd}` intrinsics.
595fn rounding_from_imm<'tcx>(rounding: i32) -> InterpResult<'tcx, rustc_apfloat::Round> {
596    // The fourth bit of `rounding` only affects the SSE status
597    // register, which cannot be accessed from Miri (or from Rust,
598    // for that matter), so we can ignore it.
599    match rounding & !0b1000 {
600        // When the third bit is 0, the rounding mode is determined by the
601        // first two bits.
602        0b000 => interp_ok(rustc_apfloat::Round::NearestTiesToEven),
603        0b001 => interp_ok(rustc_apfloat::Round::TowardNegative),
604        0b010 => interp_ok(rustc_apfloat::Round::TowardPositive),
605        0b011 => interp_ok(rustc_apfloat::Round::TowardZero),
606        // When the third bit is 1, the rounding mode is determined by the
607        // SSE status register. Since we do not support modifying it from
608        // Miri (or Rust), we assume it to be at its default mode (round-to-nearest).
609        0b100..=0b111 => interp_ok(rustc_apfloat::Round::NearestTiesToEven),
610        rounding => panic!("invalid rounding mode 0x{rounding:02x}"),
611    }
612}
613
614/// Converts each element of `op` from floating point to signed integer.
615///
616/// When the input value is NaN or out of range, fall back to minimum value.
617///
618/// If `op` has more elements than `dest`, extra elements are ignored. If `op`
619/// has less elements than `dest`, the rest is filled with zeros.
620fn convert_float_to_int<'tcx>(
621    ecx: &mut crate::MiriInterpCx<'tcx>,
622    op: &OpTy<'tcx>,
623    rnd: rustc_apfloat::Round,
624    dest: &MPlaceTy<'tcx>,
625) -> InterpResult<'tcx, ()> {
626    let (op, op_len) = ecx.project_to_simd(op)?;
627    let (dest, dest_len) = ecx.project_to_simd(dest)?;
628
629    // Output must be *signed* integers.
630    assert!(matches!(dest.layout.field(ecx, 0).ty.kind(), ty::Int(_)));
631
632    for i in 0..op_len.min(dest_len) {
633        let op = ecx.read_immediate(&ecx.project_index(&op, i)?)?;
634        let dest = ecx.project_index(&dest, i)?;
635
636        let res = ecx.float_to_int_checked(&op, dest.layout, rnd)?.unwrap_or_else(|| {
637            // Fallback to minimum according to SSE/AVX semantics.
638            ImmTy::from_int(dest.layout.size.signed_int_min(), dest.layout)
639        });
640        ecx.write_immediate(*res, &dest)?;
641    }
642    // Fill remainder with zeros
643    for i in op_len..dest_len {
644        let dest = ecx.project_index(&dest, i)?;
645        ecx.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?;
646    }
647
648    interp_ok(())
649}
650
651/// Splits `op` (which must be a SIMD vector) into 128-bit chunks.
652///
653/// Returns a tuple where:
654/// * The first element is the number of 128-bit chunks (let's call it `N`).
655/// * The second element is the number of elements per chunk (let's call it `M`).
656/// * The third element is the `op` vector split into chunks, i.e, it's
657///   type is `[[T; M]; N]` where `T` is the element type of `op`.
658fn split_simd_to_128bit_chunks<'tcx, P: Projectable<'tcx, Provenance>>(
659    ecx: &mut crate::MiriInterpCx<'tcx>,
660    op: &P,
661) -> InterpResult<'tcx, (u64, u64, P)> {
662    let simd_layout = op.layout();
663    let (simd_len, element_ty) = simd_layout.ty.simd_size_and_type(ecx.tcx.tcx);
664
665    assert_eq!(simd_layout.size.bits() % 128, 0);
666    let num_chunks = simd_layout.size.bits() / 128;
667    let items_per_chunk = simd_len.strict_div(num_chunks);
668
669    // Transmute to `[[T; items_per_chunk]; num_chunks]`
670    let chunked_layout = ecx
671        .layout_of(Ty::new_array(
672            ecx.tcx.tcx,
673            Ty::new_array(ecx.tcx.tcx, element_ty, items_per_chunk),
674            num_chunks,
675        ))
676        .unwrap();
677    let chunked_op = op.transmute(chunked_layout, ecx)?;
678
679    interp_ok((num_chunks, items_per_chunk, chunked_op))
680}
681
682/// Horizontally performs `which` operation on adjacent values of
683/// `left` and `right` SIMD vectors and stores the result in `dest`.
684/// "Horizontal" means that the i-th output element is calculated
685/// from the elements 2*i and 2*i+1 of the concatenation of `left` and
686/// `right`.
687///
688/// Each 128-bit chunk is treated independently (i.e., the value for
689/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
690/// 128-bit chunks of `left` and `right`).
691fn horizontal_bin_op<'tcx>(
692    ecx: &mut crate::MiriInterpCx<'tcx>,
693    which: mir::BinOp,
694    saturating: bool,
695    left: &OpTy<'tcx>,
696    right: &OpTy<'tcx>,
697    dest: &MPlaceTy<'tcx>,
698) -> InterpResult<'tcx, ()> {
699    assert_eq!(left.layout, dest.layout);
700    assert_eq!(right.layout, dest.layout);
701
702    let (num_chunks, items_per_chunk, left) = split_simd_to_128bit_chunks(ecx, left)?;
703    let (_, _, right) = split_simd_to_128bit_chunks(ecx, right)?;
704    let (_, _, dest) = split_simd_to_128bit_chunks(ecx, dest)?;
705
706    let middle = items_per_chunk / 2;
707    for i in 0..num_chunks {
708        let left = ecx.project_index(&left, i)?;
709        let right = ecx.project_index(&right, i)?;
710        let dest = ecx.project_index(&dest, i)?;
711
712        for j in 0..items_per_chunk {
713            // `j` is the index in `dest`
714            // `k` is the index of the 2-item chunk in `src`
715            let (k, src) = if j < middle { (j, &left) } else { (j.strict_sub(middle), &right) };
716            // `base_i` is the index of the first item of the 2-item chunk in `src`
717            let base_i = k.strict_mul(2);
718            let lhs = ecx.read_immediate(&ecx.project_index(src, base_i)?)?;
719            let rhs = ecx.read_immediate(&ecx.project_index(src, base_i.strict_add(1))?)?;
720
721            let res = if saturating {
722                Immediate::from(ecx.saturating_arith(which, &lhs, &rhs)?)
723            } else {
724                *ecx.binary_op(which, &lhs, &rhs)?
725            };
726
727            ecx.write_immediate(res, &ecx.project_index(&dest, j)?)?;
728        }
729    }
730
731    interp_ok(())
732}
733
734/// Conditionally multiplies the packed floating-point elements in
735/// `left` and `right` using the high 4 bits in `imm`, sums the calculated
736/// products (up to 4), and conditionally stores the sum in `dest` using
737/// the low 4 bits of `imm`.
738///
739/// Each 128-bit chunk is treated independently (i.e., the value for
740/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
741/// 128-bit blocks of `left` and `right`).
742fn conditional_dot_product<'tcx>(
743    ecx: &mut crate::MiriInterpCx<'tcx>,
744    left: &OpTy<'tcx>,
745    right: &OpTy<'tcx>,
746    imm: &OpTy<'tcx>,
747    dest: &MPlaceTy<'tcx>,
748) -> InterpResult<'tcx, ()> {
749    assert_eq!(left.layout, dest.layout);
750    assert_eq!(right.layout, dest.layout);
751
752    let (num_chunks, items_per_chunk, left) = split_simd_to_128bit_chunks(ecx, left)?;
753    let (_, _, right) = split_simd_to_128bit_chunks(ecx, right)?;
754    let (_, _, dest) = split_simd_to_128bit_chunks(ecx, dest)?;
755
756    let element_layout = left.layout.field(ecx, 0).field(ecx, 0);
757    assert!(items_per_chunk <= 4);
758
759    // `imm` is a `u8` for SSE4.1 or an `i32` for AVX :/
760    let imm = ecx.read_scalar(imm)?.to_uint(imm.layout.size)?;
761
762    for i in 0..num_chunks {
763        let left = ecx.project_index(&left, i)?;
764        let right = ecx.project_index(&right, i)?;
765        let dest = ecx.project_index(&dest, i)?;
766
767        // Calculate dot product
768        // Elements are floating point numbers, but we can use `from_int`
769        // for the initial value because the representation of 0.0 is all zero bits.
770        let mut sum = ImmTy::from_int(0u8, element_layout);
771        for j in 0..items_per_chunk {
772            if imm & (1 << j.strict_add(4)) != 0 {
773                let left = ecx.read_immediate(&ecx.project_index(&left, j)?)?;
774                let right = ecx.read_immediate(&ecx.project_index(&right, j)?)?;
775
776                let mul = ecx.binary_op(mir::BinOp::Mul, &left, &right)?;
777                sum = ecx.binary_op(mir::BinOp::Add, &sum, &mul)?;
778            }
779        }
780
781        // Write to destination (conditioned to imm)
782        for j in 0..items_per_chunk {
783            let dest = ecx.project_index(&dest, j)?;
784
785            if imm & (1 << j) != 0 {
786                ecx.write_immediate(*sum, &dest)?;
787            } else {
788                ecx.write_scalar(Scalar::from_int(0u8, element_layout.size), &dest)?;
789            }
790        }
791    }
792
793    interp_ok(())
794}
795
796/// Calculates two booleans.
797///
798/// The first is true when all the bits of `op & mask` are zero.
799/// The second is true when `(op & mask) == mask`
800fn test_bits_masked<'tcx>(
801    ecx: &crate::MiriInterpCx<'tcx>,
802    op: &OpTy<'tcx>,
803    mask: &OpTy<'tcx>,
804) -> InterpResult<'tcx, (bool, bool)> {
805    assert_eq!(op.layout, mask.layout);
806
807    let (op, op_len) = ecx.project_to_simd(op)?;
808    let (mask, mask_len) = ecx.project_to_simd(mask)?;
809
810    assert_eq!(op_len, mask_len);
811
812    let mut all_zero = true;
813    let mut masked_set = true;
814    for i in 0..op_len {
815        let op = ecx.project_index(&op, i)?;
816        let mask = ecx.project_index(&mask, i)?;
817
818        let op = ecx.read_scalar(&op)?.to_uint(op.layout.size)?;
819        let mask = ecx.read_scalar(&mask)?.to_uint(mask.layout.size)?;
820        all_zero &= (op & mask) == 0;
821        masked_set &= (op & mask) == mask;
822    }
823
824    interp_ok((all_zero, masked_set))
825}
826
827/// Calculates two booleans.
828///
829/// The first is true when the highest bit of each element of `op & mask` is zero.
830/// The second is true when the highest bit of each element of `!op & mask` is zero.
831fn test_high_bits_masked<'tcx>(
832    ecx: &crate::MiriInterpCx<'tcx>,
833    op: &OpTy<'tcx>,
834    mask: &OpTy<'tcx>,
835) -> InterpResult<'tcx, (bool, bool)> {
836    assert_eq!(op.layout, mask.layout);
837
838    let (op, op_len) = ecx.project_to_simd(op)?;
839    let (mask, mask_len) = ecx.project_to_simd(mask)?;
840
841    assert_eq!(op_len, mask_len);
842
843    let high_bit_offset = op.layout.field(ecx, 0).size.bits().strict_sub(1);
844
845    let mut direct = true;
846    let mut negated = true;
847    for i in 0..op_len {
848        let op = ecx.project_index(&op, i)?;
849        let mask = ecx.project_index(&mask, i)?;
850
851        let op = ecx.read_scalar(&op)?.to_uint(op.layout.size)?;
852        let mask = ecx.read_scalar(&mask)?.to_uint(mask.layout.size)?;
853        direct &= (op & mask) >> high_bit_offset == 0;
854        negated &= (!op & mask) >> high_bit_offset == 0;
855    }
856
857    interp_ok((direct, negated))
858}
859
860/// Compute the sum of absolute differences of quadruplets of unsigned
861/// 8-bit integers in `left` and `right`, and store the 16-bit results
862/// in `right`. Quadruplets are selected from `left` and `right` with
863/// offsets specified in `imm`.
864///
865/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16>
866/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8>
867///
868/// Each 128-bit chunk is treated independently (i.e., the value for
869/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
870/// 128-bit chunks of `left` and `right`).
871fn mpsadbw<'tcx>(
872    ecx: &mut crate::MiriInterpCx<'tcx>,
873    left: &OpTy<'tcx>,
874    right: &OpTy<'tcx>,
875    imm: &OpTy<'tcx>,
876    dest: &MPlaceTy<'tcx>,
877) -> InterpResult<'tcx, ()> {
878    assert_eq!(left.layout, right.layout);
879    assert_eq!(left.layout.size, dest.layout.size);
880
881    let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(ecx, left)?;
882    assert!(num_chunks <= 2);
883
884    let (_, _, right) = split_simd_to_128bit_chunks(ecx, right)?;
885    let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(ecx, dest)?;
886
887    assert_eq!(op_items_per_chunk, dest_items_per_chunk.strict_mul(2));
888
889    let imm = ecx.read_scalar(imm)?.to_uint(imm.layout.size)?;
890
891    for i in 0..num_chunks {
892        let left = ecx.project_index(&left, i)?;
893        let right = ecx.project_index(&right, i)?;
894        let dest = ecx.project_index(&dest, i)?;
895
896        // The first 128-bit chunk uses the low 3 bits of IMM, the second chunk uses bits 3..6.
897        let lane_imm = imm.strict_shr(i.strict_mul(3).try_into().unwrap());
898
899        // Bit 2 of `lane_imm` specifies the offset for indices of `left`.
900        // The offset is 0 when the bit is 0 or 4 when the bit is 1.
901        let left_base = u64::try_from((lane_imm >> 2) & 1).unwrap().strict_mul(4);
902        // Bits 0..=1 of `lane_imm` specify the offset for indices of
903        // `right` in blocks of 4 elements.
904        let right_base = u64::try_from(lane_imm & 0b11).unwrap().strict_mul(4);
905
906        for j in 0..dest_items_per_chunk {
907            let left_offset = left_base.strict_add(j);
908            let mut res: u16 = 0;
909            for k in 0..4 {
910                let left = ecx
911                    .read_scalar(&ecx.project_index(&left, left_offset.strict_add(k))?)?
912                    .to_u8()?;
913                let right = ecx
914                    .read_scalar(&ecx.project_index(&right, right_base.strict_add(k))?)?
915                    .to_u8()?;
916                res = res.strict_add(left.abs_diff(right).into());
917            }
918            ecx.write_scalar(Scalar::from_u16(res), &ecx.project_index(&dest, j)?)?;
919        }
920    }
921
922    interp_ok(())
923}
924
925/// Compute the absolute differences of packed unsigned 8-bit integers
926/// in `left` and `right`, then horizontally sum each consecutive 8
927/// differences to produce unsigned 16-bit integers, and pack
928/// these unsigned 16-bit integers in the low 16 bits of 64-bit elements
929/// in `dest`.
930///
931/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8>
932/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8>
933/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_sad_epu8>
934fn psadbw<'tcx>(
935    ecx: &mut crate::MiriInterpCx<'tcx>,
936    left: &OpTy<'tcx>,
937    right: &OpTy<'tcx>,
938    dest: &MPlaceTy<'tcx>,
939) -> InterpResult<'tcx, ()> {
940    let (left, left_len) = ecx.project_to_simd(left)?;
941    let (right, right_len) = ecx.project_to_simd(right)?;
942    let (dest, dest_len) = ecx.project_to_simd(dest)?;
943
944    // fn psadbw(a: u8x16, b: u8x16) -> u64x2;
945    // fn psadbw(a: u8x32, b: u8x32) -> u64x4;
946    // fn vpsadbw(a: u8x64, b: u8x64) -> u64x8;
947    assert_eq!(left_len, right_len);
948    assert_eq!(left_len, left.layout.layout.size().bytes());
949    assert_eq!(dest_len, left_len.strict_div(8));
950
951    for i in 0..dest_len {
952        let dest = ecx.project_index(&dest, i)?;
953
954        let mut acc: u16 = 0;
955        for j in 0..8 {
956            let src_index = i.strict_mul(8).strict_add(j);
957
958            let left = ecx.project_index(&left, src_index)?;
959            let left = ecx.read_scalar(&left)?.to_u8()?;
960
961            let right = ecx.project_index(&right, src_index)?;
962            let right = ecx.read_scalar(&right)?.to_u8()?;
963
964            acc = acc.strict_add(left.abs_diff(right).into());
965        }
966
967        ecx.write_scalar(Scalar::from_u64(acc.into()), &dest)?;
968    }
969
970    interp_ok(())
971}
972
973/// Multiply packed signed 16-bit integers in `left` and `right`, producing intermediate signed 32-bit integers.
974/// Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in `dest`.
975///
976/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16>
977/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16>
978/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_madd_epi16>
979fn pmaddwd<'tcx>(
980    ecx: &mut crate::MiriInterpCx<'tcx>,
981    left: &OpTy<'tcx>,
982    right: &OpTy<'tcx>,
983    dest: &MPlaceTy<'tcx>,
984) -> InterpResult<'tcx, ()> {
985    let (left, left_len) = ecx.project_to_simd(left)?;
986    let (right, right_len) = ecx.project_to_simd(right)?;
987    let (dest, dest_len) = ecx.project_to_simd(dest)?;
988
989    // fn  pmaddwd(a: i16x8,  b: i16x8)  -> i32x4;
990    // fn  pmaddwd(a: i16x16, b: i16x16) -> i32x8;
991    // fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
992    assert_eq!(left_len, right_len);
993    assert_eq!(dest_len.strict_mul(2), left_len);
994
995    for i in 0..dest_len {
996        let j1 = i.strict_mul(2);
997        let left1 = ecx.read_scalar(&ecx.project_index(&left, j1)?)?.to_i16()?;
998        let right1 = ecx.read_scalar(&ecx.project_index(&right, j1)?)?.to_i16()?;
999
1000        let j2 = j1.strict_add(1);
1001        let left2 = ecx.read_scalar(&ecx.project_index(&left, j2)?)?.to_i16()?;
1002        let right2 = ecx.read_scalar(&ecx.project_index(&right, j2)?)?.to_i16()?;
1003
1004        let dest = ecx.project_index(&dest, i)?;
1005
1006        // Multiplications are i16*i16->i32, which will not overflow.
1007        let mul1 = i32::from(left1).strict_mul(right1.into());
1008        let mul2 = i32::from(left2).strict_mul(right2.into());
1009        // However, this addition can overflow in the most extreme case
1010        // (-0x8000)*(-0x8000)+(-0x8000)*(-0x8000) = 0x80000000
1011        let res = mul1.wrapping_add(mul2);
1012
1013        ecx.write_scalar(Scalar::from_i32(res), &dest)?;
1014    }
1015
1016    interp_ok(())
1017}
1018
1019/// Multiplies packed 8-bit unsigned integers from `left` and packed
1020/// signed 8-bit integers from `right` into 16-bit signed integers. Then,
1021/// the saturating sum of the products with indices `2*i` and `2*i+1`
1022/// produces the output at index `i`.
1023///
1024/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16>
1025/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16>
1026/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maddubs_epi16>
1027fn pmaddbw<'tcx>(
1028    ecx: &mut crate::MiriInterpCx<'tcx>,
1029    left: &OpTy<'tcx>,
1030    right: &OpTy<'tcx>,
1031    dest: &MPlaceTy<'tcx>,
1032) -> InterpResult<'tcx, ()> {
1033    let (left, left_len) = ecx.project_to_simd(left)?;
1034    let (right, right_len) = ecx.project_to_simd(right)?;
1035    let (dest, dest_len) = ecx.project_to_simd(dest)?;
1036
1037    // fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;
1038    // fn pmaddubsw(   a: u8x32, b: i8x32) -> i16x16;
1039    // fn vpmaddubsw(  a: u8x64, b: i8x64) -> i16x32;
1040    assert_eq!(left_len, right_len);
1041    assert_eq!(dest_len.strict_mul(2), left_len);
1042
1043    for i in 0..dest_len {
1044        let j1 = i.strict_mul(2);
1045        let left1 = ecx.read_scalar(&ecx.project_index(&left, j1)?)?.to_u8()?;
1046        let right1 = ecx.read_scalar(&ecx.project_index(&right, j1)?)?.to_i8()?;
1047
1048        let j2 = j1.strict_add(1);
1049        let left2 = ecx.read_scalar(&ecx.project_index(&left, j2)?)?.to_u8()?;
1050        let right2 = ecx.read_scalar(&ecx.project_index(&right, j2)?)?.to_i8()?;
1051
1052        let dest = ecx.project_index(&dest, i)?;
1053
1054        // Multiplication of a u8 and an i8 into an i16 cannot overflow.
1055        let mul1 = i16::from(left1).strict_mul(right1.into());
1056        let mul2 = i16::from(left2).strict_mul(right2.into());
1057        let res = mul1.saturating_add(mul2);
1058
1059        ecx.write_scalar(Scalar::from_i16(res), &dest)?;
1060    }
1061
1062    interp_ok(())
1063}
1064
1065/// Shuffle elements in `values` across lanes using the corresponding index in
1066/// `indices`, and store the results in `dest`.
1067///
1068/// This helper is shared by both the 32-bit-lane and 64-bit-lane AVX
1069/// permute-by-index intrinsics. The element type is taken from `values` and
1070/// `dest`, while the index lanes are interpreted at their full width (`i32` or
1071/// `i64`, depending on the intrinsic).
1072///
1073/// For a vector with `N` lanes, only the low `log2(N)` bits of each index are
1074/// used. Equivalently, lane `i` of the result is copied from
1075/// `values[indices[i] & (N - 1)]`.
1076///
1077/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32>
1078/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps>
1079/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi32>
1080/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutexvar_epi64>
1081fn permute<'tcx>(
1082    ecx: &mut crate::MiriInterpCx<'tcx>,
1083    values: &OpTy<'tcx>,
1084    indices: &OpTy<'tcx>,
1085    dest: &MPlaceTy<'tcx>,
1086) -> InterpResult<'tcx, ()> {
1087    let (values, values_len) = ecx.project_to_simd(values)?;
1088    let (indices, indices_len) = ecx.project_to_simd(indices)?;
1089    let (dest, dest_len) = ecx.project_to_simd(dest)?;
1090
1091    // fn permd(a: u32x8, b: u32x8) -> u32x8;
1092    // fn permps(a: __m256, b: i32x8) -> __m256;
1093    // fn vpermd(a: i32x16, idx: i32x16) -> i32x16;
1094    // fn vpermq(a: i64x8, b: i64x8) -> i64x8;
1095    assert_eq!(dest_len, values_len);
1096    assert_eq!(dest_len, indices_len);
1097
1098    // Only use the lower 3 bits to index into a vector with 8 lanes,
1099    // or the lower 4 bits when indexing into a 16-lane vector.
1100    assert!(dest_len.is_power_of_two());
1101    let mask = u128::from(dest_len).strict_sub(1);
1102
1103    for i in 0..dest_len {
1104        let dest = ecx.project_index(&dest, i)?;
1105        let index_place = ecx.project_index(&indices, i)?;
1106        let index = ecx.read_scalar(&index_place)?.to_uint(index_place.layout.size)?;
1107        // `mask` is at most `dest_len - 1` which fits in a `u64`, so this cannot fail.
1108        let element = ecx.project_index(&values, u64::try_from(index & mask).unwrap())?;
1109
1110        ecx.copy_op(&element, &dest)?;
1111    }
1112
1113    interp_ok(())
1114}
1115
1116/// Shuffle elements from *two* source registers (`left` and `right`) using
1117/// the corresponding index in `indices`, and store the results in `dest`.
1118///
1119/// For a vector with `N` lanes, the low `log2(N)` bits of each index select a
1120/// lane within a source vector. Bit `log2(N)` selects the source vector (`0` =>
1121/// `left`, `1` => `right`), and all higher bits are ignored.
1122/// Equivalently, lane `i` of the result is copied from
1123/// `src[indices[i] & (N - 1)]` where
1124/// `src = if indices[i] & N == 0 { left } else { right }`.
1125///
1126/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi64>
1127/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permutex2var_epi8>
1128fn permute2<'tcx>(
1129    ecx: &mut crate::MiriInterpCx<'tcx>,
1130    left: &OpTy<'tcx>,
1131    indices: &OpTy<'tcx>,
1132    right: &OpTy<'tcx>,
1133    dest: &MPlaceTy<'tcx>,
1134) -> InterpResult<'tcx, ()> {
1135    let (left, left_len) = ecx.project_to_simd(left)?;
1136    let (indices, indices_len) = ecx.project_to_simd(indices)?;
1137    let (right, right_len) = ecx.project_to_simd(right)?;
1138    let (dest, dest_len) = ecx.project_to_simd(dest)?;
1139
1140    assert_eq!(dest_len, left_len);
1141    assert_eq!(dest_len, indices_len);
1142    assert_eq!(dest_len, right_len);
1143
1144    // Use the low bits to select a lane within either input vector, and the next bit to
1145    // choose between the two vectors.
1146    assert!(dest_len.is_power_of_two());
1147    let lane_mask = u128::from(dest_len).strict_sub(1);
1148    let vector_select_bit = u128::from(dest_len);
1149
1150    for i in 0..dest_len {
1151        let dest = ecx.project_index(&dest, i)?;
1152        let index_place = ecx.project_index(&indices, i)?;
1153        let index = ecx.read_scalar(&index_place)?.to_uint(index_place.layout.size)?;
1154        // `lane_mask` is at most `dest_len - 1` which fits in a `u64`, so this cannot fail.
1155        let lane = u64::try_from(index & lane_mask).unwrap();
1156        let src = if index & vector_select_bit == 0 { &left } else { &right };
1157        let element = ecx.project_index(src, lane)?;
1158
1159        ecx.copy_op(&element, &dest)?;
1160    }
1161
1162    interp_ok(())
1163}
1164
1165/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
1166/// product to the 18 most significant bits by right-shifting, and then
1167/// divides the 18-bit value by 2 (rounding to nearest) by first adding
1168/// 1 and then taking the bits `1..=16`.
1169///
1170/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16>
1171/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16>
1172fn pmulhrsw<'tcx>(
1173    ecx: &mut crate::MiriInterpCx<'tcx>,
1174    left: &OpTy<'tcx>,
1175    right: &OpTy<'tcx>,
1176    dest: &MPlaceTy<'tcx>,
1177) -> InterpResult<'tcx, ()> {
1178    let (left, left_len) = ecx.project_to_simd(left)?;
1179    let (right, right_len) = ecx.project_to_simd(right)?;
1180    let (dest, dest_len) = ecx.project_to_simd(dest)?;
1181
1182    assert_eq!(dest_len, left_len);
1183    assert_eq!(dest_len, right_len);
1184
1185    for i in 0..dest_len {
1186        let left = ecx.read_scalar(&ecx.project_index(&left, i)?)?.to_i16()?;
1187        let right = ecx.read_scalar(&ecx.project_index(&right, i)?)?.to_i16()?;
1188        let dest = ecx.project_index(&dest, i)?;
1189
1190        let res = (i32::from(left).strict_mul(right.into()) >> 14).strict_add(1) >> 1;
1191
1192        // The result of this operation can overflow a signed 16-bit integer.
1193        // When `left` and `right` are -0x8000, the result is 0x8000.
1194        #[expect(clippy::as_conversions)]
1195        let res = res as i16;
1196
1197        ecx.write_scalar(Scalar::from_i16(res), &dest)?;
1198    }
1199
1200    interp_ok(())
1201}
1202
1203/// Perform a carry-less multiplication of two 64-bit integers, selected from `left` and `right` according to `imm8`,
1204/// and store the results in `dst`.
1205///
1206/// `left` and `right` are both vectors of type `len` x i64. Only bits 0 and 4 of `imm8` matter;
1207/// they select the element of `left` and `right`, respectively.
1208///
1209/// `len` is the SIMD vector length (in counts of `i64` values). It is expected to be one of
1210/// `2`, `4`, or `8`.
1211///
1212/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128>
1213fn pclmulqdq<'tcx>(
1214    ecx: &mut MiriInterpCx<'tcx>,
1215    left: &OpTy<'tcx>,
1216    right: &OpTy<'tcx>,
1217    imm8: &OpTy<'tcx>,
1218    dest: &MPlaceTy<'tcx>,
1219    len: u64,
1220) -> InterpResult<'tcx, ()> {
1221    assert_eq!(left.layout, right.layout);
1222    assert_eq!(left.layout.size, dest.layout.size);
1223    assert!([2u64, 4, 8].contains(&len));
1224
1225    // Transmute the input into arrays of `[u64; len]`.
1226    // Transmute the output into an array of `[u128, len / 2]`.
1227
1228    let src_layout = ecx.layout_of(Ty::new_array(ecx.tcx.tcx, ecx.tcx.types.u64, len))?;
1229    let dest_layout = ecx.layout_of(Ty::new_array(ecx.tcx.tcx, ecx.tcx.types.u128, len / 2))?;
1230
1231    let left = left.transmute(src_layout, ecx)?;
1232    let right = right.transmute(src_layout, ecx)?;
1233    let dest = dest.transmute(dest_layout, ecx)?;
1234
1235    let imm8 = ecx.read_scalar(imm8)?.to_u8()?;
1236
1237    for i in 0..(len / 2) {
1238        let lo = i.strict_mul(2);
1239        let hi = i.strict_mul(2).strict_add(1);
1240
1241        // select the 64-bit integer from left that the user specified (low or high)
1242        let index = if (imm8 & 0x01) == 0 { lo } else { hi };
1243        let left = ecx.read_scalar(&ecx.project_index(&left, index)?)?.to_u64()?;
1244
1245        // select the 64-bit integer from right that the user specified (low or high)
1246        let index = if (imm8 & 0x10) == 0 { lo } else { hi };
1247        let right = ecx.read_scalar(&ecx.project_index(&right, index)?)?.to_u64()?;
1248
1249        let result = left.widening_carryless_mul(right);
1250
1251        let dest = ecx.project_index(&dest, i)?;
1252        ecx.write_scalar(Scalar::from_u128(result), &dest)?;
1253    }
1254
1255    interp_ok(())
1256}
1257
1258/// Shuffles bytes from `left` using `right` as pattern. Each 16-byte block is shuffled independently.
1259///
1260/// `left` and `right` are both vectors of type `len` x i8.
1261///
1262/// If the highest bit of a byte in `right` is not set, the corresponding byte in `dest` is taken
1263/// from the current 16-byte block of `left` at the position indicated by the lowest 4 bits of this
1264/// byte in `right`. If the highest bit of a byte in `right` is set, the corresponding byte in
1265/// `dest` is set to `0`.
1266///
1267/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8>
1268/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8>
1269/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_epi8>
1270fn pshufb<'tcx>(
1271    ecx: &mut crate::MiriInterpCx<'tcx>,
1272    left: &OpTy<'tcx>,
1273    right: &OpTy<'tcx>,
1274    dest: &MPlaceTy<'tcx>,
1275) -> InterpResult<'tcx, ()> {
1276    let (left, left_len) = ecx.project_to_simd(left)?;
1277    let (right, right_len) = ecx.project_to_simd(right)?;
1278    let (dest, dest_len) = ecx.project_to_simd(dest)?;
1279
1280    assert_eq!(dest_len, left_len);
1281    assert_eq!(dest_len, right_len);
1282
1283    for i in 0..dest_len {
1284        let right = ecx.read_scalar(&ecx.project_index(&right, i)?)?.to_u8()?;
1285        let dest = ecx.project_index(&dest, i)?;
1286
1287        let res = if right & 0x80 == 0 {
1288            // Shuffle each 128-bit (16-byte) block independently.
1289            let block_offset = i & !15; // round down to previous multiple of 16
1290            let j = block_offset.strict_add((right % 16).into());
1291            ecx.read_scalar(&ecx.project_index(&left, j)?)?
1292        } else {
1293            // If the highest bit in `right` is 1, write zero.
1294            Scalar::from_u8(0)
1295        };
1296
1297        ecx.write_scalar(res, &dest)?;
1298    }
1299
1300    interp_ok(())
1301}
1302
1303/// Packs two N-bit integer vectors to a single N/2-bit integers.
1304///
1305/// The conversion from N-bit to N/2-bit should be provided by `f`.
1306///
1307/// Each 128-bit chunk is treated independently (i.e., the value for
1308/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
1309/// 128-bit chunks of `left` and `right`).
1310fn pack_generic<'tcx>(
1311    ecx: &mut crate::MiriInterpCx<'tcx>,
1312    left: &OpTy<'tcx>,
1313    right: &OpTy<'tcx>,
1314    dest: &MPlaceTy<'tcx>,
1315    f: impl Fn(Scalar) -> InterpResult<'tcx, Scalar>,
1316) -> InterpResult<'tcx, ()> {
1317    assert_eq!(left.layout, right.layout);
1318    assert_eq!(left.layout.size, dest.layout.size);
1319
1320    let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(ecx, left)?;
1321    let (_, _, right) = split_simd_to_128bit_chunks(ecx, right)?;
1322    let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(ecx, dest)?;
1323
1324    assert_eq!(dest_items_per_chunk, op_items_per_chunk.strict_mul(2));
1325
1326    for i in 0..num_chunks {
1327        let left = ecx.project_index(&left, i)?;
1328        let right = ecx.project_index(&right, i)?;
1329        let dest = ecx.project_index(&dest, i)?;
1330
1331        for j in 0..op_items_per_chunk {
1332            let left = ecx.read_scalar(&ecx.project_index(&left, j)?)?;
1333            let right = ecx.read_scalar(&ecx.project_index(&right, j)?)?;
1334            let left_dest = ecx.project_index(&dest, j)?;
1335            let right_dest = ecx.project_index(&dest, j.strict_add(op_items_per_chunk))?;
1336
1337            let left_res = f(left)?;
1338            let right_res = f(right)?;
1339
1340            ecx.write_scalar(left_res, &left_dest)?;
1341            ecx.write_scalar(right_res, &right_dest)?;
1342        }
1343    }
1344
1345    interp_ok(())
1346}
1347
1348/// Converts two 16-bit integer vectors to a single 8-bit integer
1349/// vector with signed saturation.
1350///
1351/// Each 128-bit chunk is treated independently (i.e., the value for
1352/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
1353/// 128-bit chunks of `left` and `right`).
1354fn packsswb<'tcx>(
1355    ecx: &mut crate::MiriInterpCx<'tcx>,
1356    left: &OpTy<'tcx>,
1357    right: &OpTy<'tcx>,
1358    dest: &MPlaceTy<'tcx>,
1359) -> InterpResult<'tcx, ()> {
1360    pack_generic(ecx, left, right, dest, |op| {
1361        let op = op.to_i16()?;
1362        let res = i8::try_from(op).unwrap_or(if op < 0 { i8::MIN } else { i8::MAX });
1363        interp_ok(Scalar::from_i8(res))
1364    })
1365}
1366
1367/// Converts two 16-bit signed integer vectors to a single 8-bit
1368/// unsigned integer vector with saturation.
1369///
1370/// Each 128-bit chunk is treated independently (i.e., the value for
1371/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
1372/// 128-bit chunks of `left` and `right`).
1373fn packuswb<'tcx>(
1374    ecx: &mut crate::MiriInterpCx<'tcx>,
1375    left: &OpTy<'tcx>,
1376    right: &OpTy<'tcx>,
1377    dest: &MPlaceTy<'tcx>,
1378) -> InterpResult<'tcx, ()> {
1379    pack_generic(ecx, left, right, dest, |op| {
1380        let op = op.to_i16()?;
1381        let res = u8::try_from(op).unwrap_or(if op < 0 { 0 } else { u8::MAX });
1382        interp_ok(Scalar::from_u8(res))
1383    })
1384}
1385
1386/// Converts two 32-bit integer vectors to a single 16-bit integer
1387/// vector with signed saturation.
1388///
1389/// Each 128-bit chunk is treated independently (i.e., the value for
1390/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
1391/// 128-bit chunks of `left` and `right`).
1392fn packssdw<'tcx>(
1393    ecx: &mut crate::MiriInterpCx<'tcx>,
1394    left: &OpTy<'tcx>,
1395    right: &OpTy<'tcx>,
1396    dest: &MPlaceTy<'tcx>,
1397) -> InterpResult<'tcx, ()> {
1398    pack_generic(ecx, left, right, dest, |op| {
1399        let op = op.to_i32()?;
1400        let res = i16::try_from(op).unwrap_or(if op < 0 { i16::MIN } else { i16::MAX });
1401        interp_ok(Scalar::from_i16(res))
1402    })
1403}
1404
1405/// Converts two 32-bit integer vectors to a single 16-bit integer
1406/// vector with unsigned saturation.
1407///
1408/// Each 128-bit chunk is treated independently (i.e., the value for
1409/// the is i-th 128-bit chunk of `dest` is calculated with the i-th
1410/// 128-bit chunks of `left` and `right`).
1411fn packusdw<'tcx>(
1412    ecx: &mut crate::MiriInterpCx<'tcx>,
1413    left: &OpTy<'tcx>,
1414    right: &OpTy<'tcx>,
1415    dest: &MPlaceTy<'tcx>,
1416) -> InterpResult<'tcx, ()> {
1417    pack_generic(ecx, left, right, dest, |op| {
1418        let op = op.to_i32()?;
1419        let res = u16::try_from(op).unwrap_or(if op < 0 { 0 } else { u16::MAX });
1420        interp_ok(Scalar::from_u16(res))
1421    })
1422}
1423
1424/// Negates elements from `left` when the corresponding element in
1425/// `right` is negative. If an element from `right` is zero, zero
1426/// is written to the corresponding output element.
1427/// In other words, multiplies `left` with `right.signum()`.
1428fn psign<'tcx>(
1429    ecx: &mut crate::MiriInterpCx<'tcx>,
1430    left: &OpTy<'tcx>,
1431    right: &OpTy<'tcx>,
1432    dest: &MPlaceTy<'tcx>,
1433) -> InterpResult<'tcx, ()> {
1434    let (left, left_len) = ecx.project_to_simd(left)?;
1435    let (right, right_len) = ecx.project_to_simd(right)?;
1436    let (dest, dest_len) = ecx.project_to_simd(dest)?;
1437
1438    assert_eq!(dest_len, left_len);
1439    assert_eq!(dest_len, right_len);
1440
1441    for i in 0..dest_len {
1442        let dest = ecx.project_index(&dest, i)?;
1443        let left = ecx.read_immediate(&ecx.project_index(&left, i)?)?;
1444        let right = ecx.read_scalar(&ecx.project_index(&right, i)?)?.to_int(dest.layout.size)?;
1445
1446        let res =
1447            ecx.binary_op(mir::BinOp::Mul, &left, &ImmTy::from_int(right.signum(), dest.layout))?;
1448
1449        ecx.write_immediate(*res, &dest)?;
1450    }
1451
1452    interp_ok(())
1453}
1454
1455/// Calcultates either `a + b + cb_in` or `a - b - cb_in` depending on the value
1456/// of `op` and returns both the sum and the overflow bit. `op` is expected to be
1457/// either one of `mir::BinOp::AddWithOverflow` and `mir::BinOp::SubWithOverflow`.
1458fn carrying_add<'tcx>(
1459    ecx: &mut crate::MiriInterpCx<'tcx>,
1460    cb_in: &OpTy<'tcx>,
1461    a: &OpTy<'tcx>,
1462    b: &OpTy<'tcx>,
1463    op: mir::BinOp,
1464) -> InterpResult<'tcx, (ImmTy<'tcx>, Scalar)> {
1465    assert!(op == mir::BinOp::AddWithOverflow || op == mir::BinOp::SubWithOverflow);
1466
1467    let cb_in = ecx.read_scalar(cb_in)?.to_u8()? != 0;
1468    let a = ecx.read_immediate(a)?;
1469    let b = ecx.read_immediate(b)?;
1470
1471    let (sum, overflow1) = ecx.binary_op(op, &a, &b)?.to_pair(ecx);
1472    let (sum, overflow2) =
1473        ecx.binary_op(op, &sum, &ImmTy::from_uint(cb_in, a.layout))?.to_pair(ecx);
1474    let cb_out = overflow1.to_scalar().to_bool()? | overflow2.to_scalar().to_bool()?;
1475
1476    interp_ok((sum, Scalar::from_u8(cb_out.into())))
1477}