cargo/sources/registry/remote.rs
1//! Access to a Git index based registry. See [`RemoteRegistry`] for details.
2
3use crate::core::global_cache_tracker;
4use crate::core::{GitReference, PackageId, SourceId};
5use crate::sources::git;
6use crate::sources::git::fetch::RemoteKind;
7use crate::sources::git::resolve_ref;
8use crate::sources::registry::download;
9use crate::sources::registry::MaybeLock;
10use crate::sources::registry::{LoadResponse, RegistryConfig, RegistryData};
11use crate::util::cache_lock::CacheLockMode;
12use crate::util::errors::CargoResult;
13use crate::util::interning::InternedString;
14use crate::util::{Filesystem, GlobalContext};
15use anyhow::Context as _;
16use cargo_util::paths;
17use lazycell::LazyCell;
18use std::cell::{Cell, Ref, RefCell};
19use std::fs::File;
20use std::mem;
21use std::path::Path;
22use std::str;
23use std::task::{ready, Poll};
24use tracing::{debug, trace};
25
26/// A remote registry is a registry that lives at a remote URL (such as
27/// crates.io). The git index is cloned locally, and `.crate` files are
28/// downloaded as needed and cached locally.
29///
30/// This type is primarily accessed through the [`RegistryData`] trait.
31///
32/// See the [module-level documentation](super) for the index format and layout.
33///
34/// ## History of Git-based index registry
35///
36/// Using Git to host this index used to be quite efficient. The full index can
37/// be stored efficiently locally on disk, and once it is downloaded, all
38/// queries of a registry can happen locally and needn't touch the network.
39/// Git-based index was a reasonable design choice at the time when HTTP/2
40/// was just introduced.
41///
42/// However, the full index keeps growing as crates.io grows. It becomes
43/// relatively big and slows down the first use of Cargo. Git (specifically
44/// libgit2) is not efficient at handling huge amounts of small files either.
45/// On the other hand, newer protocols like HTTP/2 are prevalent and capable to
46/// serve a bunch of tiny files. Today, it is encouraged to use [`HttpRegistry`],
47/// which is the default from 1.70.0. That being said, Cargo will continue
48/// supporting Git-based index for a pretty long while.
49///
50/// [`HttpRegistry`]: super::http_remote::HttpRegistry
51pub struct RemoteRegistry<'gctx> {
52 /// The name of this source, a unique string (across all sources) used as
53 /// the directory name where its cached content is stored.
54 name: InternedString,
55 /// Path to the registry index (`$CARGO_HOME/registry/index/$REG-HASH`).
56 index_path: Filesystem,
57 /// Path to the cache of `.crate` files (`$CARGO_HOME/registry/cache/$REG-HASH`).
58 cache_path: Filesystem,
59 /// The unique identifier of this registry source.
60 source_id: SourceId,
61 /// This reference is stored so that when a registry needs update, it knows
62 /// where to fetch from.
63 index_git_ref: GitReference,
64 gctx: &'gctx GlobalContext,
65 /// A Git [tree object] to help this registry find crate metadata from the
66 /// underlying Git repository.
67 ///
68 /// This is stored here to prevent Git from repeatedly creating a tree object
69 /// during each call into `load()`.
70 ///
71 /// [tree object]: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_tree_objects
72 tree: RefCell<Option<git2::Tree<'static>>>,
73 /// A Git repository that contains the actual index we want.
74 repo: LazyCell<git2::Repository>,
75 /// The current HEAD commit of the underlying Git repository.
76 head: Cell<Option<git2::Oid>>,
77 /// This stores sha value of the current HEAD commit for convenience.
78 current_sha: Cell<Option<InternedString>>,
79 /// Whether this registry needs to update package information.
80 ///
81 /// See [`RemoteRegistry::mark_updated`] on how to make sure a registry
82 /// index is updated only once per session.
83 needs_update: bool,
84 /// Disables status messages.
85 quiet: bool,
86}
87
88impl<'gctx> RemoteRegistry<'gctx> {
89 /// Creates a Git-rebased remote registry for `source_id`.
90 ///
91 /// * `name` --- Name of a path segment where `.crate` tarballs and the
92 /// registry index are stored. Expect to be unique.
93 pub fn new(
94 source_id: SourceId,
95 gctx: &'gctx GlobalContext,
96 name: &str,
97 ) -> RemoteRegistry<'gctx> {
98 RemoteRegistry {
99 name: name.into(),
100 index_path: gctx.registry_index_path().join(name),
101 cache_path: gctx.registry_cache_path().join(name),
102 source_id,
103 gctx,
104 index_git_ref: GitReference::DefaultBranch,
105 tree: RefCell::new(None),
106 repo: LazyCell::new(),
107 head: Cell::new(None),
108 current_sha: Cell::new(None),
109 needs_update: false,
110 quiet: false,
111 }
112 }
113
114 /// Creates intermediate dirs and initialize the repository.
115 fn repo(&self) -> CargoResult<&git2::Repository> {
116 self.repo.try_borrow_with(|| {
117 trace!("acquiring registry index lock");
118 let path = self
119 .gctx
120 .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path);
121
122 match git2::Repository::open(&path) {
123 Ok(repo) => Ok(repo),
124 Err(_) => {
125 drop(paths::remove_dir_all(&path));
126 paths::create_dir_all(&path)?;
127
128 // Note that we'd actually prefer to use a bare repository
129 // here as we're not actually going to check anything out.
130 // All versions of Cargo, though, share the same CARGO_HOME,
131 // so for compatibility with older Cargo which *does* do
132 // checkouts we make sure to initialize a new full
133 // repository (not a bare one).
134 //
135 // We should change this to `init_bare` whenever we feel
136 // like enough time has passed or if we change the directory
137 // that the folder is located in, such as by changing the
138 // hash at the end of the directory.
139 //
140 // Note that in the meantime we also skip `init.templatedir`
141 // as it can be misconfigured sometimes or otherwise add
142 // things that we don't want.
143 let mut opts = git2::RepositoryInitOptions::new();
144 opts.external_template(false);
145 Ok(git2::Repository::init_opts(&path, &opts).with_context(|| {
146 format!("failed to initialize index git repository (in {:?})", path)
147 })?)
148 }
149 }
150 })
151 }
152
153 /// Get the object ID of the HEAD commit from the underlying Git repository.
154 fn head(&self) -> CargoResult<git2::Oid> {
155 if self.head.get().is_none() {
156 let repo = self.repo()?;
157 let oid = resolve_ref(&self.index_git_ref, repo)?;
158 self.head.set(Some(oid));
159 }
160 Ok(self.head.get().unwrap())
161 }
162
163 /// Returns a [`git2::Tree`] object of the current HEAD commit of the
164 /// underlying Git repository.
165 fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> {
166 {
167 let tree = self.tree.borrow();
168 if tree.is_some() {
169 return Ok(Ref::map(tree, |s| s.as_ref().unwrap()));
170 }
171 }
172 let repo = self.repo()?;
173 let commit = repo.find_commit(self.head()?)?;
174 let tree = commit.tree()?;
175
176 // SAFETY:
177 // Unfortunately in libgit2 the tree objects look like they've got a
178 // reference to the repository object which means that a tree cannot
179 // outlive the repository that it came from. Here we want to cache this
180 // tree, though, so to accomplish this we transmute it to a static
181 // lifetime.
182 //
183 // Note that we don't actually hand out the static lifetime, instead we
184 // only return a scoped one from this function. Additionally the repo
185 // we loaded from (above) lives as long as this object
186 // (`RemoteRegistry`) so we then just need to ensure that the tree is
187 // destroyed first in the destructor, hence the destructor on
188 // `RemoteRegistry` below.
189 let tree = unsafe { mem::transmute::<git2::Tree<'_>, git2::Tree<'static>>(tree) };
190 *self.tree.borrow_mut() = Some(tree);
191 Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap()))
192 }
193
194 /// Gets the current version of the registry index.
195 ///
196 /// It is usually sha of the HEAD commit from the underlying Git repository.
197 fn current_version(&self) -> Option<InternedString> {
198 if let Some(sha) = self.current_sha.get() {
199 return Some(sha);
200 }
201 let sha = InternedString::new(&self.head().ok()?.to_string());
202 self.current_sha.set(Some(sha));
203 Some(sha)
204 }
205
206 /// Whether the registry is up-to-date. See [`Self::mark_updated`] for more.
207 fn is_updated(&self) -> bool {
208 self.gctx.updated_sources().contains(&self.source_id)
209 }
210
211 /// Marks this registry as up-to-date.
212 ///
213 /// This makes sure the index is only updated once per session since it is
214 /// an expensive operation. This generally only happens when the resolver
215 /// is run multiple times, such as during `cargo publish`.
216 fn mark_updated(&self) {
217 self.gctx.updated_sources().insert(self.source_id);
218 }
219}
220
221impl<'gctx> RegistryData for RemoteRegistry<'gctx> {
222 fn prepare(&self) -> CargoResult<()> {
223 self.repo()?;
224 self.gctx
225 .deferred_global_last_use()?
226 .mark_registry_index_used(global_cache_tracker::RegistryIndex {
227 encoded_registry_name: self.name,
228 });
229 Ok(())
230 }
231
232 fn index_path(&self) -> &Filesystem {
233 &self.index_path
234 }
235
236 fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path {
237 self.gctx
238 .assert_package_cache_locked(CacheLockMode::DownloadExclusive, path)
239 }
240
241 /// Read the general concept for `load()` on [`RegistryData::load`].
242 ///
243 /// `index_version` is a string representing the version of the file used
244 /// to construct the cached copy.
245 ///
246 /// Older versions of Cargo used the single value of the hash of the HEAD
247 /// commit as a `index_version`. This is technically correct but a little
248 /// too conservative. If a new commit is fetched all cached files need to
249 /// be regenerated even if a particular file was not changed.
250 ///
251 /// However if an old cargo has written such a file we still know how to
252 /// read it, as long as we check for that hash value.
253 ///
254 /// Cargo now uses a hash of the file's contents as provided by git.
255 fn load(
256 &mut self,
257 _root: &Path,
258 path: &Path,
259 index_version: Option<&str>,
260 ) -> Poll<CargoResult<LoadResponse>> {
261 if self.needs_update {
262 return Poll::Pending;
263 }
264 // Check if the cache is valid.
265 let git_commit_hash = self.current_version();
266 if index_version.is_some() && index_version == git_commit_hash.as_deref() {
267 // This file was written by an old version of cargo, but it is
268 // still up-to-date.
269 return Poll::Ready(Ok(LoadResponse::CacheValid));
270 }
271 // Note that the index calls this method and the filesystem is locked
272 // in the index, so we don't need to worry about an `update_index`
273 // happening in a different process.
274 fn load_helper(
275 registry: &RemoteRegistry<'_>,
276 path: &Path,
277 index_version: Option<&str>,
278 ) -> CargoResult<LoadResponse> {
279 let repo = registry.repo()?;
280 let tree = registry.tree()?;
281 let entry = tree.get_path(path);
282 let entry = entry?;
283 let git_file_hash = Some(entry.id().to_string());
284
285 // Check if the cache is valid.
286 if index_version.is_some() && index_version == git_file_hash.as_deref() {
287 return Ok(LoadResponse::CacheValid);
288 }
289
290 let object = entry.to_object(repo)?;
291 let Some(blob) = object.as_blob() else {
292 anyhow::bail!("path `{}` is not a blob in the git repo", path.display())
293 };
294
295 Ok(LoadResponse::Data {
296 raw_data: blob.content().to_vec(),
297 index_version: git_file_hash,
298 })
299 }
300
301 match load_helper(&self, path, index_version) {
302 Ok(result) => Poll::Ready(Ok(result)),
303 Err(_) if !self.is_updated() => {
304 // If git returns an error and we haven't updated the repo,
305 // return pending to allow an update to try again.
306 self.needs_update = true;
307 Poll::Pending
308 }
309 Err(e)
310 if e.downcast_ref::<git2::Error>()
311 .map(|e| e.code() == git2::ErrorCode::NotFound)
312 .unwrap_or_default() =>
313 {
314 // The repo has been updated and the file does not exist.
315 Poll::Ready(Ok(LoadResponse::NotFound))
316 }
317 Err(e) => Poll::Ready(Err(e)),
318 }
319 }
320
321 fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> {
322 debug!("loading config");
323 self.prepare()?;
324 self.gctx
325 .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path);
326 match ready!(self.load(Path::new(""), Path::new(RegistryConfig::NAME), None)?) {
327 LoadResponse::Data { raw_data, .. } => {
328 trace!("config loaded");
329 let cfg: RegistryConfig = serde_json::from_slice(&raw_data)?;
330 Poll::Ready(Ok(Some(cfg)))
331 }
332 _ => Poll::Ready(Ok(None)),
333 }
334 }
335
336 fn block_until_ready(&mut self) -> CargoResult<()> {
337 if !self.needs_update {
338 return Ok(());
339 }
340
341 self.needs_update = false;
342
343 if self.is_updated() {
344 return Ok(());
345 }
346 self.mark_updated();
347
348 if !self.gctx.network_allowed() {
349 return Ok(());
350 }
351 if self.gctx.cli_unstable().no_index_update {
352 return Ok(());
353 }
354
355 debug!("updating the index");
356
357 // Ensure that we'll actually be able to acquire an HTTP handle later on
358 // once we start trying to download crates. This will weed out any
359 // problems with `.cargo/config` configuration related to HTTP.
360 //
361 // This way if there's a problem the error gets printed before we even
362 // hit the index, which may not actually read this configuration.
363 self.gctx.http()?;
364
365 self.prepare()?;
366 self.head.set(None);
367 *self.tree.borrow_mut() = None;
368 self.current_sha.set(None);
369 let _path = self
370 .gctx
371 .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path);
372 if !self.quiet {
373 self.gctx
374 .shell()
375 .status("Updating", self.source_id.display_index())?;
376 }
377
378 // Fetch the latest version of our `index_git_ref` into the index
379 // checkout.
380 let url = self.source_id.url();
381 let repo = self.repo.borrow_mut().unwrap();
382 git::fetch(
383 repo,
384 url.as_str(),
385 &self.index_git_ref,
386 self.gctx,
387 RemoteKind::Registry,
388 )
389 .with_context(|| format!("failed to fetch `{}`", url))?;
390
391 Ok(())
392 }
393
394 /// Read the general concept for `invalidate_cache()` on
395 /// [`RegistryData::invalidate_cache`].
396 ///
397 /// To fully invalidate, undo [`RemoteRegistry::mark_updated`]'s work.
398 fn invalidate_cache(&mut self) {
399 self.needs_update = true;
400 }
401
402 fn set_quiet(&mut self, quiet: bool) {
403 self.quiet = quiet;
404 }
405
406 fn is_updated(&self) -> bool {
407 self.is_updated()
408 }
409
410 fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> {
411 let registry_config = loop {
412 match self.config()? {
413 Poll::Pending => self.block_until_ready()?,
414 Poll::Ready(cfg) => break cfg.unwrap(),
415 }
416 };
417
418 download::download(
419 &self.cache_path,
420 &self.gctx,
421 self.name,
422 pkg,
423 checksum,
424 registry_config,
425 )
426 }
427
428 fn finish_download(
429 &mut self,
430 pkg: PackageId,
431 checksum: &str,
432 data: &[u8],
433 ) -> CargoResult<File> {
434 download::finish_download(
435 &self.cache_path,
436 &self.gctx,
437 self.name.clone(),
438 pkg,
439 checksum,
440 data,
441 )
442 }
443
444 fn is_crate_downloaded(&self, pkg: PackageId) -> bool {
445 download::is_crate_downloaded(&self.cache_path, &self.gctx, pkg)
446 }
447}
448
449/// Implemented to just be sure to drop `tree` field before our other fields.
450/// See SAFETY inside [`RemoteRegistry::tree()`] for more.
451impl<'gctx> Drop for RemoteRegistry<'gctx> {
452 fn drop(&mut self) {
453 self.tree.borrow_mut().take();
454 }
455}