cargo/util/network/
retry.rs

1//! Utilities for retrying a network operation.
2//!
//! Some network errors are considered "spurious", meaning the failure is not
//! a real error (such as a 404 not found) but is likely a transient one (like
//! a bad network connection) that we can hope will resolve itself shortly. The
//! [`Retry`] type offers a way to repeatedly perform some kind of network
//! operation with a delay if it detects one of these possibly transient
//! errors.
9//!
10//! This supports errors from [`git2`], [`gix`], [`curl`], and
11//! [`HttpNotSuccessful`] 5xx HTTP errors.
12//!
13//! The number of retries can be configured by the user via the `net.retry`
14//! config option. This indicates the number of times to retry the operation
15//! (default 3 times for a total of 4 attempts).
16//!
17//! There are hard-coded constants that indicate how long to sleep between
18//! retries. The constants are tuned to balance a few factors, such as the
19//! responsiveness to the user (we don't want cargo to hang for too long
20//! retrying things), and accommodating things like Cloudfront's default
21//! negative TTL of 10 seconds (if Cloudfront gets a 5xx error for whatever
22//! reason it won't try to fetch again for 10 seconds).
23//!
//! The timeout also implements a primitive form of random jitter. This is so
//! that if multiple requests fail at the same time, they don't all flood the
//! server at the same time when they are retried. This jitter still has
//! some clumping behavior, but should be good enough.
28//!
29//! [`Retry`] is the core type for implementing retry logic. The
30//! [`Retry::try`] method can be called with a callback, and it will
31//! indicate if it needs to be called again sometime in the future if there
32//! was a possibly transient error. The caller is responsible for sleeping the
33//! appropriate amount of time and then calling [`Retry::try`] again.
34//!
35//! [`with_retry`] is a convenience function that will create a [`Retry`] and
36//! handle repeatedly running a callback until it succeeds, or it runs out of
37//! retries.
38//!
39//! Some interesting resources about retries:
40//! - <https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/>
41//! - <https://en.wikipedia.org/wiki/Exponential_backoff>
42//! - <https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After>
43
44use crate::util::errors::HttpNotSuccessful;
45use crate::{CargoResult, GlobalContext};
46use anyhow::Error;
47use rand::Rng;
48use std::cmp::min;
49use std::time::Duration;
50
51/// State for managing retrying a network operation.
52pub struct Retry<'a> {
53    gctx: &'a GlobalContext,
54    /// The number of failed attempts that have been done so far.
55    ///
56    /// Starts at 0, and increases by one each time an attempt fails.
57    retries: u64,
58    /// The maximum number of times the operation should be retried.
59    ///
60    /// 0 means it should never retry.
61    max_retries: u64,
62}
63
64/// The result of attempting some operation via [`Retry::try`].
65pub enum RetryResult<T> {
66    /// The operation was successful.
67    ///
68    /// The wrapped value is the return value of the callback function.
69    Success(T),
70    /// The operation was an error, and it should not be tried again.
71    Err(anyhow::Error),
72    /// The operation failed, and should be tried again in the future.
73    ///
74    /// The wrapped value is the number of milliseconds to wait before trying
75    /// again. The caller is responsible for waiting this long and then
76    /// calling [`Retry::try`] again.
77    Retry(u64),
78}
79
/// Upper bound, in milliseconds, on the delay before any single retry.
const MAX_RETRY_SLEEP_MS: u64 = 10_000;
/// Lower bound, in milliseconds, on the delay before the first retry.
///
/// The real delay is this value plus a random amount of jitter.
const INITIAL_RETRY_SLEEP_BASE_MS: u64 = 500;
/// Width, in milliseconds, of the random jitter added to the first retry.
///
/// The first delay is [`INITIAL_RETRY_SLEEP_BASE_MS`] plus a random value
/// drawn from the range starting at 0 and ending at this value.
const INITIAL_RETRY_JITTER_MS: u64 = 1_000;
91
92impl<'a> Retry<'a> {
93    pub fn new(gctx: &'a GlobalContext) -> CargoResult<Retry<'a>> {
94        Ok(Retry {
95            gctx,
96            retries: 0,
97            max_retries: gctx.net_config()?.retry.unwrap_or(3) as u64,
98        })
99    }
100
101    /// Calls the given callback, and returns a [`RetryResult`] which
102    /// indicates whether or not this needs to be called again at some point
103    /// in the future to retry the operation if it failed.
104    pub fn r#try<T>(&mut self, f: impl FnOnce() -> CargoResult<T>) -> RetryResult<T> {
105        match f() {
106            Err(ref e) if maybe_spurious(e) && self.retries < self.max_retries => {
107                let err_msg = e
108                    .downcast_ref::<HttpNotSuccessful>()
109                    .map(|http_err| http_err.display_short())
110                    .unwrap_or_else(|| e.root_cause().to_string());
111                let msg = format!(
112                    "spurious network error ({} tries remaining): {err_msg}",
113                    self.max_retries - self.retries,
114                );
115                if let Err(e) = self.gctx.shell().warn(msg) {
116                    return RetryResult::Err(e);
117                }
118                self.retries += 1;
119                RetryResult::Retry(self.next_sleep_ms())
120            }
121            Err(e) => RetryResult::Err(e),
122            Ok(r) => RetryResult::Success(r),
123        }
124    }
125
126    /// Gets the next sleep duration in milliseconds.
127    fn next_sleep_ms(&self) -> u64 {
128        if let Ok(sleep) = self.gctx.get_env("__CARGO_TEST_FIXED_RETRY_SLEEP_MS") {
129            return sleep.parse().expect("a u64");
130        }
131
132        if self.retries == 1 {
133            let mut rng = rand::rng();
134            INITIAL_RETRY_SLEEP_BASE_MS + rng.random_range(0..INITIAL_RETRY_JITTER_MS)
135        } else {
136            min(
137                ((self.retries - 1) * 3) * 1000 + INITIAL_RETRY_SLEEP_BASE_MS,
138                MAX_RETRY_SLEEP_MS,
139            )
140        }
141    }
142}
143
144fn maybe_spurious(err: &Error) -> bool {
145    if let Some(git_err) = err.downcast_ref::<git2::Error>() {
146        match git_err.class() {
147            git2::ErrorClass::Net
148            | git2::ErrorClass::Os
149            | git2::ErrorClass::Zlib
150            | git2::ErrorClass::Http => return git_err.code() != git2::ErrorCode::Certificate,
151            _ => (),
152        }
153    }
154    if let Some(curl_err) = err.downcast_ref::<curl::Error>() {
155        if curl_err.is_couldnt_connect()
156            || curl_err.is_couldnt_resolve_proxy()
157            || curl_err.is_couldnt_resolve_host()
158            || curl_err.is_operation_timedout()
159            || curl_err.is_recv_error()
160            || curl_err.is_send_error()
161            || curl_err.is_http2_error()
162            || curl_err.is_http2_stream_error()
163            || curl_err.is_ssl_connect_error()
164            || curl_err.is_partial_file()
165        {
166            return true;
167        }
168    }
169    if let Some(not_200) = err.downcast_ref::<HttpNotSuccessful>() {
170        if 500 <= not_200.code && not_200.code < 600 {
171            return true;
172        }
173    }
174
175    use gix::protocol::transport::IsSpuriousError;
176
177    if let Some(err) = err.downcast_ref::<crate::sources::git::fetch::Error>() {
178        if err.is_spurious() {
179            return true;
180        }
181    }
182
183    false
184}
185
186/// Wrapper method for network call retry logic.
187///
188/// Retry counts provided by Config object `net.retry`. Config shell outputs
189/// a warning on per retry.
190///
191/// Closure must return a `CargoResult`.
192///
193/// # Examples
194///
195/// ```
196/// # use crate::cargo::util::{CargoResult, GlobalContext};
197/// # let download_something = || return Ok(());
198/// # let gctx = GlobalContext::default().unwrap();
199/// use cargo::util::network;
200/// let cargo_result = network::retry::with_retry(&gctx, || download_something());
201/// ```
202pub fn with_retry<T, F>(gctx: &GlobalContext, mut callback: F) -> CargoResult<T>
203where
204    F: FnMut() -> CargoResult<T>,
205{
206    let mut retry = Retry::new(gctx)?;
207    loop {
208        match retry.r#try(&mut callback) {
209            RetryResult::Success(r) => return Ok(r),
210            RetryResult::Err(e) => return Err(e),
211            RetryResult::Retry(sleep) => std::thread::sleep(Duration::from_millis(sleep)),
212        }
213    }
214}
215
#[test]
fn with_retry_repeats_the_call_then_works() {
    use crate::core::Shell;

    // 5xx HTTP status codes count as possibly spurious, so each one prompts a retry.
    let server_error = |code| -> anyhow::Error {
        HttpNotSuccessful {
            code,
            url: "Uri".to_string(),
            ip: None,
            body: Vec::new(),
            headers: Vec::new(),
        }
        .into()
    };
    // Outcomes are popped from the back: the 502, then the 501, then success.
    let mut outcomes: Vec<CargoResult<()>> =
        vec![Ok(()), Err(server_error(501)), Err(server_error(502))];
    let gctx = GlobalContext::default().unwrap();
    // Send shell output to an in-memory buffer.
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let outcome = with_retry(&gctx, || outcomes.pop().unwrap());
    assert!(outcome.is_ok())
}
243
#[test]
fn with_retry_finds_nested_spurious_errors() {
    use crate::core::Shell;

    // 5xx HTTP status codes count as possibly spurious and prompt a retry.
    // Plain string messages are not spurious; this test expects the HTTP error
    // to still be detected when such a message is layered on top of it.
    let wrapped_server_error = |code, context: &'static str| -> anyhow::Error {
        anyhow::Error::from(HttpNotSuccessful {
            code,
            url: "Uri".to_string(),
            ip: None,
            body: Vec::new(),
            headers: Vec::new(),
        })
        .context(context)
    };
    // Outcomes are popped from the back: the 502, then the 501, then success.
    let mut outcomes: Vec<CargoResult<()>> = vec![
        Ok(()),
        Err(wrapped_server_error(501, "A non-spurious wrapping err")),
        Err(wrapped_server_error(502, "A second chained error")),
    ];
    let gctx = GlobalContext::default().unwrap();
    // Send shell output to an in-memory buffer.
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let outcome = with_retry(&gctx, || outcomes.pop().unwrap());
    assert!(outcome.is_ok())
}
272
#[test]
fn default_retry_schedule() {
    use crate::core::Shell;

    /// Always fails with a 500 HTTP error, which is treated as possibly spurious.
    fn spurious() -> CargoResult<()> {
        Err(anyhow::Error::from(HttpNotSuccessful {
            code: 500,
            url: "Uri".to_string(),
            ip: None,
            body: Vec::new(),
            headers: Vec::new(),
        }))
    }

    let gctx = GlobalContext::default().unwrap();
    // Send shell output to an in-memory buffer.
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let mut retry = Retry::new(&gctx).unwrap();

    // First retry: the base delay plus a random amount of jitter.
    match retry.r#try(spurious) {
        RetryResult::Retry(sleep) => {
            let jitter_window = INITIAL_RETRY_SLEEP_BASE_MS
                ..(INITIAL_RETRY_SLEEP_BASE_MS + INITIAL_RETRY_JITTER_MS);
            assert!(
                jitter_window.contains(&sleep),
                "first delay of {sleep}ms is outside {jitter_window:?}"
            );
        }
        _ => panic!("unexpected non-retry"),
    }
    // Later retries follow a fixed schedule with no jitter.
    match retry.r#try(spurious) {
        RetryResult::Retry(sleep) => assert_eq!(sleep, 3500),
        _ => panic!("unexpected non-retry"),
    }
    match retry.r#try(spurious) {
        RetryResult::Retry(sleep) => assert_eq!(sleep, 6500),
        _ => panic!("unexpected non-retry"),
    }
    // The default limit of 3 retries is now used up, so the error is final.
    match retry.r#try(spurious) {
        RetryResult::Err(_) => {}
        _ => panic!("expected an error once the retries are exhausted"),
    }
}
311
#[test]
fn curle_http2_stream_is_spurious() {
    // A curl HTTP/2 stream error must be classified as possibly spurious.
    let err: anyhow::Error = curl::Error::new(curl_sys::CURLE_HTTP2_STREAM).into();
    assert!(maybe_spurious(&err));
}