cargo/sources/registry/
remote.rs

1//! Access to a Git index based registry. See [`RemoteRegistry`] for details.
2
3use crate::core::global_cache_tracker;
4use crate::core::{GitReference, PackageId, SourceId};
5use crate::sources::git;
6use crate::sources::git::fetch::RemoteKind;
7use crate::sources::git::resolve_ref;
8use crate::sources::registry::MaybeLock;
9use crate::sources::registry::download;
10use crate::sources::registry::{LoadResponse, RegistryConfig, RegistryData};
11use crate::util::cache_lock::CacheLockMode;
12use crate::util::errors::CargoResult;
13use crate::util::interning::InternedString;
14use crate::util::{Filesystem, GlobalContext, OnceExt};
15use anyhow::Context as _;
16use cargo_util::paths;
17use std::cell::OnceCell;
18use std::cell::{Cell, Ref, RefCell};
19use std::fs::File;
20use std::mem;
21use std::path::Path;
22use std::str;
23use std::task::{Poll, ready};
24use tracing::{debug, trace};
25
/// A remote registry is a registry that lives at a remote URL (such as
/// crates.io). The git index is cloned locally, and `.crate` files are
/// downloaded as needed and cached locally.
///
/// This type is primarily accessed through the [`RegistryData`] trait.
///
/// See the [module-level documentation](super) for the index format and layout.
///
/// ## History of Git-based index registry
///
/// Using Git to host this index used to be quite efficient. The full index can
/// be stored efficiently locally on disk, and once it is downloaded, all
/// queries of a registry can happen locally and needn't touch the network.
/// A Git-based index was a reasonable design choice at the time when HTTP/2
/// was just introduced.
///
/// However, the full index keeps growing as crates.io grows. It has become
/// relatively big and slows down the first use of Cargo. Git (specifically
/// libgit2) is not efficient at handling huge amounts of small files either.
/// On the other hand, newer protocols like HTTP/2 are prevalent and capable of
/// serving a bunch of tiny files. Today, it is encouraged to use [`HttpRegistry`],
/// which is the default from 1.70.0. That being said, Cargo will continue
/// supporting the Git-based index for a pretty long while.
///
/// [`HttpRegistry`]: super::http_remote::HttpRegistry
pub struct RemoteRegistry<'gctx> {
    /// The name of this source, a unique string (across all sources) used as
    /// the directory name where its cached content is stored.
    name: InternedString,
    /// Path to the registry index (`$CARGO_HOME/registry/index/$REG-HASH`).
    index_path: Filesystem,
    /// Path to the cache of `.crate` files (`$CARGO_HOME/registry/cache/$REG-HASH`).
    cache_path: Filesystem,
    /// The unique identifier of this registry source.
    source_id: SourceId,
    /// This reference is stored so that when a registry needs update, it knows
    /// where to fetch from.
    index_git_ref: GitReference,
    /// The global Cargo context, used here for cache-lock assertions, shell
    /// status output, network/unstable-flag checks and the per-session set of
    /// already-updated sources.
    gctx: &'gctx GlobalContext,
    /// A Git [tree object] to help this registry find crate metadata from the
    /// underlying Git repository.
    ///
    /// This is stored here to prevent Git from repeatedly creating a tree object
    /// during each call into `load()`.
    ///
    /// The `'static` lifetime is a deliberate fiction: the tree really borrows
    /// from `repo` below. See the SAFETY comment in `tree()` and the `Drop`
    /// implementation of this type.
    ///
    /// [tree object]: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_tree_objects
    tree: RefCell<Option<git2::Tree<'static>>>,
    /// A Git repository that contains the actual index we want.
    ///
    /// Lazily opened (or initialized) on the first call to `repo()`.
    repo: OnceCell<git2::Repository>,
    /// The current HEAD commit of the underlying Git repository.
    ///
    /// Cached by `head()` and reset to `None` before the index is fetched.
    head: Cell<Option<git2::Oid>>,
    /// This stores the sha value of the current HEAD commit for convenience.
    current_sha: Cell<Option<InternedString>>,
    /// Whether this registry needs to update package information.
    ///
    /// While this is set, `load()` returns `Poll::Pending`; it is cleared by
    /// `block_until_ready()`.
    ///
    /// See [`RemoteRegistry::mark_updated`] on how to make sure a registry
    /// index is updated only once per session.
    needs_update: bool,
    /// Disables status messages.
    quiet: bool,
}
87
88impl<'gctx> RemoteRegistry<'gctx> {
89    /// Creates a Git-rebased remote registry for `source_id`.
90    ///
91    /// * `name` --- Name of a path segment where `.crate` tarballs and the
92    ///   registry index are stored. Expect to be unique.
93    pub fn new(
94        source_id: SourceId,
95        gctx: &'gctx GlobalContext,
96        name: &str,
97    ) -> RemoteRegistry<'gctx> {
98        RemoteRegistry {
99            name: name.into(),
100            index_path: gctx.registry_index_path().join(name),
101            cache_path: gctx.registry_cache_path().join(name),
102            source_id,
103            gctx,
104            index_git_ref: GitReference::DefaultBranch,
105            tree: RefCell::new(None),
106            repo: OnceCell::new(),
107            head: Cell::new(None),
108            current_sha: Cell::new(None),
109            needs_update: false,
110            quiet: false,
111        }
112    }
113
114    /// Creates intermediate dirs and initialize the repository.
115    fn repo(&self) -> CargoResult<&git2::Repository> {
116        self.repo.try_borrow_with(|| {
117            trace!("acquiring registry index lock");
118            let path = self
119                .gctx
120                .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path);
121
122            match git2::Repository::open(&path) {
123                Ok(repo) => Ok(repo),
124                Err(_) => {
125                    drop(paths::remove_dir_all(&path));
126                    paths::create_dir_all(&path)?;
127
128                    // Note that we'd actually prefer to use a bare repository
129                    // here as we're not actually going to check anything out.
130                    // All versions of Cargo, though, share the same CARGO_HOME,
131                    // so for compatibility with older Cargo which *does* do
132                    // checkouts we make sure to initialize a new full
133                    // repository (not a bare one).
134                    //
135                    // We should change this to `init_bare` whenever we feel
136                    // like enough time has passed or if we change the directory
137                    // that the folder is located in, such as by changing the
138                    // hash at the end of the directory.
139                    //
140                    // Note that in the meantime we also skip `init.templatedir`
141                    // as it can be misconfigured sometimes or otherwise add
142                    // things that we don't want.
143                    let mut opts = git2::RepositoryInitOptions::new();
144                    opts.external_template(false);
145                    Ok(git2::Repository::init_opts(&path, &opts).with_context(|| {
146                        format!("failed to initialize index git repository (in {:?})", path)
147                    })?)
148                }
149            }
150        })
151    }
152
153    /// Get the object ID of the HEAD commit from the underlying Git repository.
154    fn head(&self) -> CargoResult<git2::Oid> {
155        if self.head.get().is_none() {
156            let repo = self.repo()?;
157            let oid = resolve_ref(&self.index_git_ref, repo)?;
158            self.head.set(Some(oid));
159        }
160        Ok(self.head.get().unwrap())
161    }
162
    /// Returns a [`git2::Tree`] object of the current HEAD commit of the
    /// underlying Git repository.
    ///
    /// The tree is looked up once and cached in `self.tree`; subsequent calls
    /// hand out a shared borrow of the cached value. Fails if the repository
    /// cannot be opened, HEAD cannot be resolved, or the commit/tree lookup
    /// fails.
    fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> {
        {
            // Fast path: serve the cached tree. The borrow is confined to this
            // block so that it is released before `borrow_mut()` below.
            let tree = self.tree.borrow();
            if tree.is_some() {
                return Ok(Ref::map(tree, |s| s.as_ref().unwrap()));
            }
        }
        let repo = self.repo()?;
        let commit = repo.find_commit(self.head()?)?;
        let tree = commit.tree()?;

        // SAFETY:
        // Unfortunately in libgit2 the tree objects look like they've got a
        // reference to the repository object which means that a tree cannot
        // outlive the repository that it came from. Here we want to cache this
        // tree, though, so to accomplish this we transmute it to a static
        // lifetime.
        //
        // Note that we don't actually hand out the static lifetime, instead we
        // only return a scoped one from this function. Additionally the repo
        // we loaded from (above) lives as long as this object
        // (`RemoteRegistry`) so we then just need to ensure that the tree is
        // destroyed first in the destructor, hence the destructor on
        // `RemoteRegistry` below.
        let tree = unsafe { mem::transmute::<git2::Tree<'_>, git2::Tree<'static>>(tree) };
        *self.tree.borrow_mut() = Some(tree);
        // The cache was just filled, so the `unwrap()` cannot fail.
        Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap()))
    }
193
194    /// Gets the current version of the registry index.
195    ///
196    /// It is usually sha of the HEAD commit from the underlying Git repository.
197    fn current_version(&self) -> Option<InternedString> {
198        if let Some(sha) = self.current_sha.get() {
199            return Some(sha);
200        }
201        let sha = self.head().ok()?.to_string().into();
202        self.current_sha.set(Some(sha));
203        Some(sha)
204    }
205
    /// Whether the registry is up-to-date. See [`Self::mark_updated`] for more.
    ///
    /// Returns `true` when this registry's `source_id` is present in the
    /// global context's set of updated sources.
    fn is_updated(&self) -> bool {
        self.gctx.updated_sources().contains(&self.source_id)
    }
210
    /// Marks this registry as up-to-date.
    ///
    /// This makes sure the index is only updated once per session since it is
    /// an expensive operation. This generally only happens when the resolver
    /// is run multiple times, such as during `cargo publish`.
    ///
    /// The mark is recorded by inserting `source_id` into the global context's
    /// set of updated sources, which is what `is_updated()` consults.
    fn mark_updated(&self) {
        self.gctx.updated_sources().insert(self.source_id);
    }
219}
220
221impl<'gctx> RegistryData for RemoteRegistry<'gctx> {
222    fn prepare(&self) -> CargoResult<()> {
223        self.repo()?;
224        self.gctx
225            .deferred_global_last_use()?
226            .mark_registry_index_used(global_cache_tracker::RegistryIndex {
227                encoded_registry_name: self.name,
228            });
229        Ok(())
230    }
231
    /// Returns the location of this registry's index on disk.
    fn index_path(&self) -> &Filesystem {
        &self.index_path
    }
235
    /// Returns the location of this registry's `.crate` file cache on disk.
    fn cache_path(&self) -> &Filesystem {
        &self.cache_path
    }
239
    /// Resolves `path` to a plain [`Path`] through the global context's
    /// package-cache lock assertion, using
    /// [`CacheLockMode::DownloadExclusive`].
    ///
    /// NOTE(review): presumably this panics when the lock is not held, as the
    /// helper's name suggests; confirm against `GlobalContext`.
    fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path {
        self.gctx
            .assert_package_cache_locked(CacheLockMode::DownloadExclusive, path)
    }
244
245    /// Read the general concept for `load()` on [`RegistryData::load`].
246    ///
247    /// `index_version` is a string representing the version of the file used
248    /// to construct the cached copy.
249    ///
250    /// Older versions of Cargo used the single value of the hash of the HEAD
251    /// commit as a `index_version`. This is technically correct but a little
252    /// too conservative. If a new commit is fetched all cached files need to
253    /// be regenerated even if a particular file was not changed.
254    ///
255    /// However if an old cargo has written such a file we still know how to
256    /// read it, as long as we check for that hash value.
257    ///
258    /// Cargo now uses a hash of the file's contents as provided by git.
259    fn load(
260        &mut self,
261        _root: &Path,
262        path: &Path,
263        index_version: Option<&str>,
264    ) -> Poll<CargoResult<LoadResponse>> {
265        if self.needs_update {
266            return Poll::Pending;
267        }
268        // Check if the cache is valid.
269        let git_commit_hash = self.current_version();
270        if index_version.is_some() && index_version == git_commit_hash.as_deref() {
271            // This file was written by an old version of cargo, but it is
272            // still up-to-date.
273            return Poll::Ready(Ok(LoadResponse::CacheValid));
274        }
275        // Note that the index calls this method and the filesystem is locked
276        // in the index, so we don't need to worry about an `update_index`
277        // happening in a different process.
278        fn load_helper(
279            registry: &RemoteRegistry<'_>,
280            path: &Path,
281            index_version: Option<&str>,
282        ) -> CargoResult<LoadResponse> {
283            let repo = registry.repo()?;
284            let tree = registry.tree()?;
285            let entry = tree.get_path(path);
286            let entry = entry?;
287            let git_file_hash = Some(entry.id().to_string());
288
289            // Check if the cache is valid.
290            if index_version.is_some() && index_version == git_file_hash.as_deref() {
291                return Ok(LoadResponse::CacheValid);
292            }
293
294            let object = entry.to_object(repo)?;
295            let Some(blob) = object.as_blob() else {
296                anyhow::bail!("path `{}` is not a blob in the git repo", path.display())
297            };
298
299            Ok(LoadResponse::Data {
300                raw_data: blob.content().to_vec(),
301                index_version: git_file_hash,
302            })
303        }
304
305        match load_helper(&self, path, index_version) {
306            Ok(result) => Poll::Ready(Ok(result)),
307            Err(_) if !self.is_updated() => {
308                // If git returns an error and we haven't updated the repo,
309                // return pending to allow an update to try again.
310                self.needs_update = true;
311                Poll::Pending
312            }
313            Err(e)
314                if e.downcast_ref::<git2::Error>()
315                    .map(|e| e.code() == git2::ErrorCode::NotFound)
316                    .unwrap_or_default() =>
317            {
318                // The repo has been updated and the file does not exist.
319                Poll::Ready(Ok(LoadResponse::NotFound))
320            }
321            Err(e) => Poll::Ready(Err(e)),
322        }
323    }
324
325    fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> {
326        debug!("loading config");
327        self.prepare()?;
328        self.gctx
329            .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path);
330        match ready!(self.load(Path::new(""), Path::new(RegistryConfig::NAME), None)?) {
331            LoadResponse::Data { raw_data, .. } => {
332                trace!("config loaded");
333                let cfg: RegistryConfig = serde_json::from_slice(&raw_data)?;
334                Poll::Ready(Ok(Some(cfg)))
335            }
336            _ => Poll::Ready(Ok(None)),
337        }
338    }
339
340    fn block_until_ready(&mut self) -> CargoResult<()> {
341        if !self.needs_update {
342            return Ok(());
343        }
344
345        self.needs_update = false;
346
347        if self.is_updated() {
348            return Ok(());
349        }
350        self.mark_updated();
351
352        if !self.gctx.network_allowed() {
353            return Ok(());
354        }
355        if self.gctx.cli_unstable().no_index_update {
356            return Ok(());
357        }
358
359        debug!("updating the index");
360
361        // Ensure that we'll actually be able to acquire an HTTP handle later on
362        // once we start trying to download crates. This will weed out any
363        // problems with `.cargo/config` configuration related to HTTP.
364        //
365        // This way if there's a problem the error gets printed before we even
366        // hit the index, which may not actually read this configuration.
367        self.gctx.http()?;
368
369        self.prepare()?;
370        self.head.set(None);
371        *self.tree.borrow_mut() = None;
372        self.current_sha.set(None);
373        let _path = self
374            .gctx
375            .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path);
376        if !self.quiet {
377            self.gctx
378                .shell()
379                .status("Updating", self.source_id.display_index())?;
380        }
381
382        // Fetch the latest version of our `index_git_ref` into the index
383        // checkout.
384        let url = self.source_id.url();
385        let repo = self.repo.get_mut().unwrap();
386        git::fetch(
387            repo,
388            url.as_str(),
389            &self.index_git_ref,
390            self.gctx,
391            RemoteKind::Registry,
392        )
393        .with_context(|| format!("failed to fetch `{}`", url))?;
394
395        Ok(())
396    }
397
    /// Read the general concept for `invalidate_cache()` on
    /// [`RegistryData::invalidate_cache`].
    ///
    /// This only requests an update by setting `needs_update`. It does not
    /// clear the per-session "updated" mark, so `block_until_ready()` will
    /// still skip the fetch if this source has already been marked as updated.
    /// To fully invalidate, one would additionally have to undo
    /// [`RemoteRegistry::mark_updated`]'s work.
    fn invalidate_cache(&mut self) {
        self.needs_update = true;
    }
405
    /// Enables or disables status messages; when `quiet` is `true` the
    /// "Updating" status line in `block_until_ready()` is not printed.
    fn set_quiet(&mut self, quiet: bool) {
        self.quiet = quiet;
    }
409
    /// Whether this registry has already been updated during this session.
    ///
    /// Not a recursive call: inherent methods take precedence over trait
    /// methods, so this dispatches to the inherent
    /// `RemoteRegistry::is_updated`.
    fn is_updated(&self) -> bool {
        self.is_updated()
    }
413
414    fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> {
415        let registry_config = loop {
416            match self.config()? {
417                Poll::Pending => self.block_until_ready()?,
418                Poll::Ready(cfg) => break cfg.unwrap(),
419            }
420        };
421
422        download::download(
423            &self.cache_path,
424            &self.gctx,
425            self.name,
426            pkg,
427            checksum,
428            registry_config,
429        )
430    }
431
432    fn finish_download(
433        &mut self,
434        pkg: PackageId,
435        checksum: &str,
436        data: &[u8],
437    ) -> CargoResult<File> {
438        download::finish_download(
439            &self.cache_path,
440            &self.gctx,
441            self.name.clone(),
442            pkg,
443            checksum,
444            data,
445        )
446    }
447
    /// Whether the `.crate` file for `pkg` is already present; delegates to
    /// `download::is_crate_downloaded` with this registry's cache path.
    fn is_crate_downloaded(&self, pkg: PackageId) -> bool {
        download::is_crate_downloaded(&self.cache_path, &self.gctx, pkg)
    }
451}
452
453/// Implemented to just be sure to drop `tree` field before our other fields.
454/// See SAFETY inside [`RemoteRegistry::tree()`] for more.
455impl<'gctx> Drop for RemoteRegistry<'gctx> {
456    fn drop(&mut self) {
457        self.tree.borrow_mut().take();
458    }
459}