Skip to main content

cargo/sources/registry/
remote.rs

1//! Access to a Git index based registry. See [`RemoteRegistry`] for details.
2
3use crate::core::global_cache_tracker;
4use crate::core::{GitReference, PackageId, SourceId};
5use crate::sources::git;
6use crate::sources::git::fetch::RemoteKind;
7use crate::sources::git::resolve_ref;
8use crate::sources::registry::MaybeLock;
9use crate::sources::registry::download;
10use crate::sources::registry::{LoadResponse, RegistryConfig, RegistryData};
11use crate::util::cache_lock::CacheLockMode;
12use crate::util::errors::CargoResult;
13use crate::util::interning::InternedString;
14use crate::util::{Filesystem, GlobalContext};
15use anyhow::Context as _;
16use cargo_util::paths;
17use std::cell::{Cell, Ref, RefCell};
18use std::fs::File;
19use std::mem;
20use std::path::Path;
21use std::str;
22use tracing::{debug, trace};
23
/// A remote registry is a registry that lives at a remote URL (such as
/// crates.io). The git index is cloned locally, and `.crate` files are
/// downloaded as needed and cached locally.
///
/// This type is primarily accessed through the [`RegistryData`] trait.
///
/// See the [module-level documentation](super) for the index format and layout.
///
/// ## History of Git-based index registry
///
/// Using Git to host this index used to be quite efficient. The full index can
/// be stored efficiently locally on disk, and once it is downloaded, all
/// queries of a registry can happen locally and needn't touch the network.
/// A Git-based index was a reasonable design choice at the time when HTTP/2
/// had only just been introduced.
///
/// However, the full index keeps growing as crates.io grows. It has become
/// relatively big and slows down the first use of Cargo. Git (specifically
/// libgit2) is not efficient at handling huge amounts of small files either.
/// On the other hand, newer protocols like HTTP/2 are prevalent and capable of
/// serving a bunch of tiny files. Today, it is encouraged to use
/// [`HttpRegistry`], which is the default from 1.70.0. That being said, Cargo
/// will continue supporting the Git-based index for a pretty long while.
///
/// [`HttpRegistry`]: super::http_remote::HttpRegistry
pub struct RemoteRegistry<'gctx> {
    /// The name of this source, a unique string (across all sources) used as
    /// the directory name where its cached content is stored.
    name: InternedString,
    /// Path to the registry index (`$CARGO_HOME/registry/index/$REG-HASH`).
    index_path: Filesystem,
    /// Path to the cache of `.crate` files (`$CARGO_HOME/registry/cache/$REG-HASH`).
    cache_path: Filesystem,
    /// The unique identifier of this registry source.
    source_id: SourceId,
    /// This reference is stored so that when a registry needs update, it knows
    /// where to fetch from.
    index_git_ref: GitReference,
    /// The global context. In this file it is consulted for package-cache
    /// lock assertions, shell status output, network/unstable-flag checks,
    /// and the per-session set of already-updated sources.
    gctx: &'gctx GlobalContext,
    /// A Git [tree object] to help this registry find crate metadata from the
    /// underlying Git repository.
    ///
    /// This is stored here to prevent Git from repeatedly creating a tree object
    /// during each call into `load()`.
    ///
    /// The `'static` lifetime is not real: the tree borrows from `repo` and is
    /// transmuted in [`RemoteRegistry::tree`]. The `Drop` impl for this type
    /// clears this field explicitly so the tree never outlives the repository.
    ///
    /// [tree object]: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_tree_objects
    tree: RefCell<Option<git2::Tree<'static>>>,
    /// A Git repository that contains the actual index we want.
    ///
    /// `None` until the first call to `repo()` opens or initializes it.
    repo: RefCell<Option<git2::Repository>>,
    /// The current HEAD commit of the underlying Git repository.
    ///
    /// Cached by `head()` and cleared again by `update()` before fetching.
    head: Cell<Option<git2::Oid>>,
    /// This stores sha value of the current HEAD commit for convenience.
    current_sha: Cell<Option<InternedString>>,
    /// Whether this registry needs to update package information.
    ///
    /// Starts out `false`; it is set by `invalidate_cache()` and by `load()`
    /// when a lookup fails before the index was updated in this session.
    ///
    /// See [`RemoteRegistry::mark_updated`] on how to make sure a registry
    /// index is updated only once per session.
    needs_update: Cell<bool>,
    /// Disables status messages.
    quiet: bool,
}
85
86impl<'gctx> RemoteRegistry<'gctx> {
87    /// Creates a Git-rebased remote registry for `source_id`.
88    ///
89    /// * `name` --- Name of a path segment where `.crate` tarballs and the
90    ///   registry index are stored. Expect to be unique.
91    pub fn new(
92        source_id: SourceId,
93        gctx: &'gctx GlobalContext,
94        name: &str,
95    ) -> RemoteRegistry<'gctx> {
96        RemoteRegistry {
97            name: name.into(),
98            index_path: gctx.registry_index_path().join(name),
99            cache_path: gctx.registry_cache_path().join(name),
100            source_id,
101            gctx,
102            index_git_ref: GitReference::DefaultBranch,
103            tree: RefCell::new(None),
104            repo: RefCell::new(None),
105            head: Cell::new(None),
106            current_sha: Cell::new(None),
107            needs_update: Cell::new(false),
108            quiet: false,
109        }
110    }
111
112    /// Creates intermediate dirs and initialize the repository.
113    fn repo(&self) -> CargoResult<Ref<'_, Option<git2::Repository>>> {
114        if self.repo.borrow().is_none() {
115            trace!("acquiring registry index lock");
116            let path = self
117                .gctx
118                .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path);
119
120            self.repo.replace(Some(match git2::Repository::open(&path) {
121                Ok(repo) => repo,
122                Err(_) => {
123                    drop(paths::remove_dir_all(&path));
124                    paths::create_dir_all(&path)?;
125
126                    // Note that we'd actually prefer to use a bare repository
127                    // here as we're not actually going to check anything out.
128                    // All versions of Cargo, though, share the same CARGO_HOME,
129                    // so for compatibility with older Cargo which *does* do
130                    // checkouts we make sure to initialize a new full
131                    // repository (not a bare one).
132                    //
133                    // We should change this to `init_bare` whenever we feel
134                    // like enough time has passed or if we change the directory
135                    // that the folder is located in, such as by changing the
136                    // hash at the end of the directory.
137                    //
138                    // Note that in the meantime we also skip `init.templatedir`
139                    // as it can be misconfigured sometimes or otherwise add
140                    // things that we don't want.
141                    let mut opts = git2::RepositoryInitOptions::new();
142                    opts.external_template(false);
143                    git2::Repository::init_opts(&path, &opts).with_context(|| {
144                        format!("failed to initialize index git repository (in {:?})", path)
145                    })?
146                }
147            }));
148        }
149
150        Ok(self.repo.borrow())
151    }
152
153    /// Get the object ID of the HEAD commit from the underlying Git repository.
154    fn head(&self) -> CargoResult<git2::Oid> {
155        if self.head.get().is_none() {
156            let repo = self.repo()?;
157            let repo = repo.as_ref().unwrap();
158            let oid = resolve_ref(&self.index_git_ref, repo)?;
159            self.head.set(Some(oid));
160        }
161        Ok(self.head.get().unwrap())
162    }
163
164    /// Returns a [`git2::Tree`] object of the current HEAD commit of the
165    /// underlying Git repository.
166    fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> {
167        {
168            let tree = self.tree.borrow();
169            if tree.is_some() {
170                return Ok(Ref::map(tree, |s| s.as_ref().unwrap()));
171            }
172        }
173        let repo = self.repo()?;
174        let repo = repo.as_ref().unwrap();
175        let commit = repo.find_commit(self.head()?)?;
176        let tree = commit.tree()?;
177
178        // SAFETY:
179        // Unfortunately in libgit2 the tree objects look like they've got a
180        // reference to the repository object which means that a tree cannot
181        // outlive the repository that it came from. Here we want to cache this
182        // tree, though, so to accomplish this we transmute it to a static
183        // lifetime.
184        //
185        // Note that we don't actually hand out the static lifetime, instead we
186        // only return a scoped one from this function. Additionally the repo
187        // we loaded from (above) lives as long as this object
188        // (`RemoteRegistry`) so we then just need to ensure that the tree is
189        // destroyed first in the destructor, hence the destructor on
190        // `RemoteRegistry` below.
191        let tree = unsafe { mem::transmute::<git2::Tree<'_>, git2::Tree<'static>>(tree) };
192        *self.tree.borrow_mut() = Some(tree);
193        Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap()))
194    }
195
196    /// Gets the current version of the registry index.
197    ///
198    /// It is usually sha of the HEAD commit from the underlying Git repository.
199    fn current_version(&self) -> Option<InternedString> {
200        if let Some(sha) = self.current_sha.get() {
201            return Some(sha);
202        }
203        let sha = self.head().ok()?.to_string().into();
204        self.current_sha.set(Some(sha));
205        Some(sha)
206    }
207
208    /// Whether the registry is up-to-date. See [`Self::mark_updated`] for more.
209    fn is_updated(&self) -> bool {
210        self.gctx.updated_sources().contains(&self.source_id)
211    }
212
213    /// Marks this registry as up-to-date.
214    ///
215    /// This makes sure the index is only updated once per session since it is
216    /// an expensive operation. This generally only happens when the resolver
217    /// is run multiple times, such as during `cargo publish`.
218    fn mark_updated(&self) {
219        self.gctx.updated_sources().insert(self.source_id);
220    }
221
222    fn update(&self) -> CargoResult<()> {
223        if !self.needs_update.get() {
224            return Ok(());
225        }
226
227        self.needs_update.set(false);
228
229        if self.is_updated() {
230            return Ok(());
231        }
232        self.mark_updated();
233
234        if !self.gctx.network_allowed() {
235            return Ok(());
236        }
237        if self.gctx.cli_unstable().no_index_update {
238            return Ok(());
239        }
240
241        debug!("updating the index");
242
243        // Ensure that we'll actually be able to acquire an HTTP handle later on
244        // once we start trying to download crates. This will weed out any
245        // problems with `.cargo/config` configuration related to HTTP.
246        //
247        // This way if there's a problem the error gets printed before we even
248        // hit the index, which may not actually read this configuration.
249        self.gctx.http()?;
250
251        self.prepare()?;
252        self.head.set(None);
253        *self.tree.borrow_mut() = None;
254        self.current_sha.set(None);
255        let _path = self
256            .gctx
257            .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path);
258        if !self.quiet {
259            self.gctx
260                .shell()
261                .status("Updating", self.source_id.display_index())?;
262        }
263
264        // Fetch the latest version of our `index_git_ref` into the index
265        // checkout.
266        let url = self.source_id.url();
267        let mut repo = self.repo.borrow_mut();
268        let repo = repo.as_mut().unwrap();
269        git::fetch(
270            repo,
271            url.as_str(),
272            &self.index_git_ref,
273            &self.index_git_ref,
274            self.gctx,
275            RemoteKind::Registry,
276        )
277        .with_context(|| format!("failed to fetch `{}`", url))?;
278
279        Ok(())
280    }
281}
282
283#[async_trait::async_trait(?Send)]
284impl<'gctx> RegistryData for RemoteRegistry<'gctx> {
285    fn prepare(&self) -> CargoResult<()> {
286        self.repo()?;
287        self.gctx
288            .deferred_global_last_use()?
289            .mark_registry_index_used(global_cache_tracker::RegistryIndex {
290                encoded_registry_name: self.name,
291            });
292        Ok(())
293    }
294
295    fn index_path(&self) -> &Filesystem {
296        &self.index_path
297    }
298
299    fn cache_path(&self) -> &Filesystem {
300        &self.cache_path
301    }
302
303    fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path {
304        self.gctx
305            .assert_package_cache_locked(CacheLockMode::DownloadExclusive, path)
306    }
307
308    /// Read the general concept for `load()` on [`RegistryData::load`].
309    ///
310    /// `index_version` is a string representing the version of the file used
311    /// to construct the cached copy.
312    ///
313    /// Older versions of Cargo used the single value of the hash of the HEAD
314    /// commit as a `index_version`. This is technically correct but a little
315    /// too conservative. If a new commit is fetched all cached files need to
316    /// be regenerated even if a particular file was not changed.
317    ///
318    /// However if an old cargo has written such a file we still know how to
319    /// read it, as long as we check for that hash value.
320    ///
321    /// Cargo now uses a hash of the file's contents as provided by git.
322    async fn load(
323        &self,
324        _root: &Path,
325        path: &Path,
326        index_version: Option<&str>,
327    ) -> CargoResult<LoadResponse> {
328        if self.needs_update.get() {
329            self.update()?;
330        }
331        // Check if the cache is valid.
332        let git_commit_hash = self.current_version();
333        if index_version.is_some() && index_version == git_commit_hash.as_deref() {
334            // This file was written by an old version of cargo, but it is
335            // still up-to-date.
336            return Ok(LoadResponse::CacheValid);
337        }
338        // Note that the index calls this method and the filesystem is locked
339        // in the index, so we don't need to worry about an `update_index`
340        // happening in a different process.
341        fn load_helper(
342            registry: &RemoteRegistry<'_>,
343            path: &Path,
344            index_version: Option<&str>,
345        ) -> CargoResult<LoadResponse> {
346            let repo = registry.repo()?;
347            let repo = repo.as_ref().unwrap();
348            let tree = registry.tree()?;
349            let entry = tree.get_path(path);
350            let entry = entry?;
351            let git_file_hash = Some(entry.id().to_string());
352
353            // Check if the cache is valid.
354            if index_version.is_some() && index_version == git_file_hash.as_deref() {
355                return Ok(LoadResponse::CacheValid);
356            }
357
358            let object = entry.to_object(repo)?;
359            let Some(blob) = object.as_blob() else {
360                anyhow::bail!("path `{}` is not a blob in the git repo", path.display())
361            };
362
363            Ok(LoadResponse::Data {
364                raw_data: blob.content().to_vec(),
365                index_version: git_file_hash,
366            })
367        }
368
369        loop {
370            return match load_helper(&self, path, index_version) {
371                Ok(result) => Ok(result),
372                Err(_) if !self.is_updated() => {
373                    // If git returns an error and we haven't updated the repo,
374                    // return pending to allow an update to try again.
375                    self.needs_update.set(true);
376                    self.update()?;
377                    continue;
378                }
379                Err(e)
380                    if e.downcast_ref::<git2::Error>()
381                        .map(|e| e.code() == git2::ErrorCode::NotFound)
382                        .unwrap_or_default() =>
383                {
384                    // The repo has been updated and the file does not exist.
385                    Ok(LoadResponse::NotFound)
386                }
387                Err(e) => Err(e),
388            };
389        }
390    }
391
392    async fn config(&self) -> CargoResult<Option<RegistryConfig>> {
393        debug!("loading config");
394        self.prepare()?;
395        self.gctx
396            .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path);
397        match self
398            .load(Path::new(""), Path::new(RegistryConfig::NAME), None)
399            .await?
400        {
401            LoadResponse::Data { raw_data, .. } => {
402                trace!("config loaded");
403                let cfg: RegistryConfig = serde_json::from_slice(&raw_data)?;
404                Ok(Some(cfg))
405            }
406            _ => Ok(None),
407        }
408    }
409
410    /// Read the general concept for `invalidate_cache()` on
411    /// [`RegistryData::invalidate_cache`].
412    ///
413    /// To fully invalidate, undo [`RemoteRegistry::mark_updated`]'s work.
414    fn invalidate_cache(&self) {
415        self.needs_update.set(true);
416    }
417
418    fn set_quiet(&mut self, quiet: bool) {
419        self.quiet = quiet;
420    }
421
422    fn is_updated(&self) -> bool {
423        self.is_updated()
424    }
425
426    fn download(&self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> {
427        let registry_config = crate::util::block_on(self.config())?.unwrap();
428
429        download::download(
430            &self.cache_path,
431            &self.gctx,
432            self.name,
433            pkg,
434            checksum,
435            registry_config,
436        )
437    }
438
439    fn finish_download(&self, pkg: PackageId, checksum: &str, data: &[u8]) -> CargoResult<File> {
440        download::finish_download(
441            &self.cache_path,
442            &self.gctx,
443            self.name.clone(),
444            pkg,
445            checksum,
446            data,
447        )
448    }
449
450    fn is_crate_downloaded(&self, pkg: PackageId) -> bool {
451        download::is_crate_downloaded(&self.cache_path, &self.gctx, pkg)
452    }
453}
454
455/// Implemented to just be sure to drop `tree` field before our other fields.
456/// See SAFETY inside [`RemoteRegistry::tree()`] for more.
457impl<'gctx> Drop for RemoteRegistry<'gctx> {
458    fn drop(&mut self) {
459        self.tree.borrow_mut().take();
460    }
461}