miri/shims/unix/linux_like/
sync.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
use crate::concurrency::sync::FutexRef;
use crate::helpers::check_min_arg_count;
use crate::*;

struct LinuxFutex {
    futex: FutexRef,
}

/// Implementation of the SYS_futex syscall.
/// `args` is the arguments *including* the syscall number.
pub fn futex<'tcx>(
    ecx: &mut MiriInterpCx<'tcx>,
    args: &[OpTy<'tcx>],
    dest: &MPlaceTy<'tcx>,
) -> InterpResult<'tcx> {
    // The amount of arguments used depends on the type of futex operation.
    // The full futex syscall takes six arguments (excluding the syscall
    // number), which is also the maximum amount of arguments a linux syscall
    // can take on most architectures.
    // However, not all futex operations use all six arguments. The unused ones
    // may or may not be left out from the `syscall()` call.
    // Therefore we don't use `check_arg_count` here, but only check for the
    // number of arguments to fall within a range.
    let [_, addr, op, val] = check_min_arg_count("`syscall(SYS_futex, ...)`", args)?;

    // The first three arguments (after the syscall number itself) are the same to all futex operations:
    //     (int *addr, int op, int val).
    // We checked above that these definitely exist.
    let addr = ecx.read_pointer(addr)?;
    let op = ecx.read_scalar(op)?.to_i32()?;
    let val = ecx.read_scalar(val)?.to_i32()?;

    // This is a vararg function so we have to bring our own type for this pointer.
    let addr = ecx.ptr_to_mplace(addr, ecx.machine.layouts.i32);

    let futex_private = ecx.eval_libc_i32("FUTEX_PRIVATE_FLAG");
    let futex_wait = ecx.eval_libc_i32("FUTEX_WAIT");
    let futex_wait_bitset = ecx.eval_libc_i32("FUTEX_WAIT_BITSET");
    let futex_wake = ecx.eval_libc_i32("FUTEX_WAKE");
    let futex_wake_bitset = ecx.eval_libc_i32("FUTEX_WAKE_BITSET");
    let futex_realtime = ecx.eval_libc_i32("FUTEX_CLOCK_REALTIME");

    // FUTEX_PRIVATE enables an optimization that stops it from working across processes.
    // Miri doesn't support that anyway, so we ignore that flag.
    match op & !futex_private {
        // FUTEX_WAIT: (int *addr, int op = FUTEX_WAIT, int val, const timespec *timeout)
        // Blocks the thread if *addr still equals val. Wakes up when FUTEX_WAKE is called on the same address,
        // or *timeout expires. `timeout == null` for an infinite timeout.
        //
        // FUTEX_WAIT_BITSET: (int *addr, int op = FUTEX_WAIT_BITSET, int val, const timespec *timeout, int *_ignored, unsigned int bitset)
        // This is identical to FUTEX_WAIT, except:
        //  - The timeout is absolute rather than relative.
        //  - You can specify the bitset to selecting what WAKE operations to respond to.
        op if op & !futex_realtime == futex_wait || op & !futex_realtime == futex_wait_bitset => {
            let wait_bitset = op & !futex_realtime == futex_wait_bitset;

            let (timeout, bitset) = if wait_bitset {
                let [_, _, _, _, timeout, uaddr2, bitset] =
                    check_min_arg_count("`syscall(SYS_futex, FUTEX_WAIT_BITSET, ...)`", args)?;
                let _timeout = ecx.read_pointer(timeout)?;
                let _uaddr2 = ecx.read_pointer(uaddr2)?;
                (timeout, ecx.read_scalar(bitset)?.to_u32()?)
            } else {
                let [_, _, _, _, timeout] =
                    check_min_arg_count("`syscall(SYS_futex, FUTEX_WAIT, ...)`", args)?;
                (timeout, u32::MAX)
            };

            if bitset == 0 {
                return ecx.set_last_error_and_return(LibcError("EINVAL"), dest);
            }

            let timeout = ecx.deref_pointer_as(timeout, ecx.libc_ty_layout("timespec"))?;
            let timeout = if ecx.ptr_is_null(timeout.ptr())? {
                None
            } else {
                let duration = match ecx.read_timespec(&timeout)? {
                    Some(duration) => duration,
                    None => {
                        return ecx.set_last_error_and_return(LibcError("EINVAL"), dest);
                    }
                };
                let timeout_clock = if op & futex_realtime == futex_realtime {
                    ecx.check_no_isolation(
                        "`futex` syscall with `op=FUTEX_WAIT` and non-null timeout with `FUTEX_CLOCK_REALTIME`",
                    )?;
                    TimeoutClock::RealTime
                } else {
                    TimeoutClock::Monotonic
                };
                let timeout_anchor = if wait_bitset {
                    // FUTEX_WAIT_BITSET uses an absolute timestamp.
                    TimeoutAnchor::Absolute
                } else {
                    // FUTEX_WAIT uses a relative timestamp.
                    TimeoutAnchor::Relative
                };
                Some((timeout_clock, timeout_anchor, duration))
            };
            // There may be a concurrent thread changing the value of addr
            // and then invoking the FUTEX_WAKE syscall. It is critical that the
            // effects of this and the other thread are correctly observed,
            // otherwise we will deadlock.
            //
            // There are two scenarios to consider, depending on whether WAIT or WAKE goes first:
            // 1. If we (FUTEX_WAIT) execute first, we'll push ourselves into the waiters queue and
            //    go to sleep. They (FUTEX_WAKE) will see us in the queue and wake us up. It doesn't
            //    matter how the addr write is ordered.
            // 2. If they (FUTEX_WAKE) execute first, that means the addr write is also before us
            //    (FUTEX_WAIT). It is crucial that we observe addr's new value. If we see an
            //    outdated value that happens to equal the expected val, then we'll put ourselves to
            //    sleep with no one to wake us up, so we end up with a deadlock. This is prevented
            //    by having a SeqCst fence inside FUTEX_WAKE syscall, and another SeqCst fence here
            //    in FUTEX_WAIT. The atomic read on addr after the SeqCst fence is guaranteed not to
            //    see any value older than the addr write immediately before calling FUTEX_WAKE.
            //    We'll see futex_val != val and return without sleeping.
            //
            //    Note that the fences do not create any happens-before relationship.
            //    The read sees the write immediately before the fence not because
            //    one happens after the other, but is instead due to a guarantee unique
            //    to SeqCst fences that restricts what an atomic read placed AFTER the
            //    fence can see. The read still has to be atomic, otherwise it's a data
            //    race. This guarantee cannot be achieved with acquire-release fences
            //    since they only talk about reads placed BEFORE a fence - and places
            //    no restrictions on what the read itself can see, only that there is
            //    a happens-before between the fences IF the read happens to see the
            //    right value. This is useless to us, since we need the read itself
            //    to see an up-to-date value.
            //
            // The above case distinction is valid since both FUTEX_WAIT and FUTEX_WAKE
            // contain a SeqCst fence, therefore inducing a total order between the operations.
            // It is also critical that the fence, the atomic load, and the comparison in FUTEX_WAIT
            // altogether happen atomically. If the other thread's fence in FUTEX_WAKE
            // gets interleaved after our fence, then we lose the guarantee on the
            // atomic load being up-to-date; if the other thread's write on addr and FUTEX_WAKE
            // call are interleaved after the load but before the comparison, then we get a TOCTOU
            // race condition, and go to sleep thinking the other thread will wake us up,
            // even though they have already finished.
            //
            // Thankfully, preemptions cannot happen inside a Miri shim, so we do not need to
            // do anything special to guarantee fence-load-comparison atomicity.
            ecx.atomic_fence(AtomicFenceOrd::SeqCst)?;
            // Read an `i32` through the pointer, regardless of any wrapper types.
            // It's not uncommon for `addr` to be passed as another type than `*mut i32`, such as `*const AtomicI32`.
            // We do an acquire read -- it only seems reasonable that if we observe a value here, we
            // actually establish an ordering with that value.
            let futex_val = ecx.read_scalar_atomic(&addr, AtomicReadOrd::Acquire)?.to_i32()?;
            if val == futex_val {
                // The value still matches, so we block the thread and make it wait for FUTEX_WAKE.

                // This cannot fail since we already did an atomic acquire read on that pointer.
                // Acquire reads are only allowed on mutable memory.
                let futex_ref = ecx
                    .get_sync_or_init(addr.ptr(), |_| LinuxFutex { futex: Default::default() })
                    .unwrap()
                    .futex
                    .clone();

                ecx.futex_wait(
                    futex_ref,
                    bitset,
                    timeout,
                    Scalar::from_target_isize(0, ecx), // retval_succ
                    Scalar::from_target_isize(-1, ecx), // retval_timeout
                    dest.clone(),
                    LibcError("ETIMEDOUT"), // errno_timeout
                );
            } else {
                // The futex value doesn't match the expected value, so we return failure
                // right away without sleeping: -1 and errno set to EAGAIN.
                return ecx.set_last_error_and_return(LibcError("EAGAIN"), dest);
            }
        }
        // FUTEX_WAKE: (int *addr, int op = FUTEX_WAKE, int val)
        // Wakes at most `val` threads waiting on the futex at `addr`.
        // Returns the amount of threads woken up.
        // Does not access the futex value at *addr.
        // FUTEX_WAKE_BITSET: (int *addr, int op = FUTEX_WAKE, int val, const timespect *_unused, int *_unused, unsigned int bitset)
        // Same as FUTEX_WAKE, but allows you to specify a bitset to select which threads to wake up.
        op if op == futex_wake || op == futex_wake_bitset => {
            let Some(futex_ref) =
                ecx.get_sync_or_init(addr.ptr(), |_| LinuxFutex { futex: Default::default() })
            else {
                // No AllocId, or no live allocation at that AllocId.
                // Return an error code. (That seems nicer than silently doing something non-intuitive.)
                // This means that if an address gets reused by a new allocation,
                // we'll use an independent futex queue for this... that seems acceptable.
                return ecx.set_last_error_and_return(LibcError("EFAULT"), dest);
            };
            let futex_ref = futex_ref.futex.clone();

            let bitset = if op == futex_wake_bitset {
                let [_, _, _, _, timeout, uaddr2, bitset] =
                    check_min_arg_count("`syscall(SYS_futex, FUTEX_WAKE_BITSET, ...)`", args)?;
                let _timeout = ecx.read_pointer(timeout)?;
                let _uaddr2 = ecx.read_pointer(uaddr2)?;
                ecx.read_scalar(bitset)?.to_u32()?
            } else {
                u32::MAX
            };
            if bitset == 0 {
                return ecx.set_last_error_and_return(LibcError("EINVAL"), dest);
            }
            // Together with the SeqCst fence in futex_wait, this makes sure that futex_wait
            // will see the latest value on addr which could be changed by our caller
            // before doing the syscall.
            ecx.atomic_fence(AtomicFenceOrd::SeqCst)?;
            let mut n = 0;
            #[expect(clippy::arithmetic_side_effects)]
            for _ in 0..val {
                if ecx.futex_wake(&futex_ref, bitset)? {
                    n += 1;
                } else {
                    break;
                }
            }
            ecx.write_scalar(Scalar::from_target_isize(n, ecx), dest)?;
        }
        op => throw_unsup_format!("Miri does not support `futex` syscall with op={}", op),
    }

    interp_ok(())
}