Skip to main content

cargo/sources/git/
source.rs

1//! See [`GitSource`].
2
3use crate::core::GitReference;
4use crate::core::SourceId;
5use crate::core::global_cache_tracker;
6use crate::core::{Dependency, Package, PackageId};
7use crate::sources::IndexSummary;
8use crate::sources::RecursivePathSource;
9use crate::sources::git::utils::GitDatabase;
10use crate::sources::git::utils::GitRemote;
11use crate::sources::git::utils::rev_to_oid;
12use crate::sources::source::MaybePackage;
13use crate::sources::source::QueryKind;
14use crate::sources::source::Source;
15use crate::util::GlobalContext;
16use crate::util::cache_lock::CacheLockMode;
17use crate::util::errors::CargoResult;
18use crate::util::hex::short_hash;
19use crate::util::interning::InternedString;
20use anyhow::Context as _;
21use cargo_util::paths::exclude_from_backups_and_indexing;
22use std::cell::RefCell;
23use std::fmt::{self, Debug, Formatter};
24use tracing::trace;
25use url::Url;
26
27/// `GitSource` contains one or more packages gathering from a Git repository.
28/// Under the hood it uses [`RecursivePathSource`] to discover packages inside the
29/// repository.
30///
31/// ## Filesystem layout
32///
33/// During a successful `GitSource` download, at least two Git repositories are
34/// created: one is the shared Git database of this remote, and the other is the
35/// Git checkout to a specific revision, which contains the actual files to be
36/// compiled. Multiple checkouts can be cloned from a single Git database.
37///
38/// Those repositories are located at Cargo's Git cache directory
39/// `$CARGO_HOME/git`. The file tree of the cache directory roughly looks like:
40///
41/// ```text
42/// $CARGO_HOME/git/
43/// ├── checkouts/
44/// │  ├── gimli-a0d193bd15a5ed96/
45/// │  │  ├── 8e73ef0/     # Git short ID for a certain revision
46/// │  │  ├── a2a4b78/
47/// │  │  └── e33d1ac/
48/// │  ├── log-c58e1db3de7c154d-shallow/
49/// │  │  └── 11eda98/
50/// └── db/
51///    ├── gimli-a0d193bd15a5ed96/
52///    └── log-c58e1db3de7c154d-shallow/
53/// ```
54///
55/// For more on Git cache directory, see ["Cargo Home"] in The Cargo Book.
56///
57/// For more on the directory format `<pkg>-<hash>[-shallow]`, see [`ident`]
58/// and [`ident_shallow`].
59///
60/// ## Locked to a revision
61///
62/// Once a `GitSource` is fetched, it will resolve to a specific commit revision.
63/// This is often mentioned as "locked revision" (`locked_rev`) throughout the
64/// codebase. The revision is written into `Cargo.lock`. This is essential since
65/// we want to ensure a package can compiles with the same set of files when
66/// a `Cargo.lock` is present. With the `locked_rev` provided, `GitSource` can
67/// precisely fetch the same revision from the Git repository.
68///
69/// ["Cargo Home"]: https://doc.rust-lang.org/nightly/cargo/guide/cargo-home.html#directories
70pub struct GitSource<'gctx> {
71    /// The git remote which we're going to fetch from.
72    remote: GitRemote,
73    /// The revision which a git source is locked to.
74    ///
75    /// Expected to always be [`Revision::Locked`] after the Git repository is fetched.
76    locked_rev: RefCell<Revision>,
77    /// The unique identifier of this source.
78    source_id: RefCell<SourceId>,
79    /// The underlying path source to discover packages inside the Git repository.
80    ///
81    /// This gets set to `Some` after the git repo has been checked out
82    /// (automatically handled via [`GitSource::update`]).
83    path_source: RefCell<Option<RecursivePathSource<'gctx>>>,
84    /// A short string that uniquely identifies the version of the checkout.
85    ///
86    /// This is typically a 7-character string of the OID hash, automatically
87    /// increasing in size if it is ambiguous.
88    ///
89    /// This is set to `Some` after the git repo has been checked out
90    /// (automatically handled via [`GitSource::update`]).
91    short_id: RefCell<Option<InternedString>>,
92    /// The identifier of this source for Cargo's Git cache directory.
93    /// See [`ident`] for more.
94    ident: InternedString,
95    gctx: &'gctx GlobalContext,
96    /// Disables status messages.
97    quiet: bool,
98}
99
100impl<'gctx> GitSource<'gctx> {
101    /// Creates a git source for the given [`SourceId`].
102    pub fn new(source_id: SourceId, gctx: &'gctx GlobalContext) -> CargoResult<GitSource<'gctx>> {
103        let remote = GitRemote::new(source_id.url());
104        Self::new_with_remote(source_id, remote, gctx)
105    }
106
107    /// Creates a git source for a submodule with an URL that may not be a valid WHATWG URL.
108    ///
109    /// This is needed because [`SourceId`] hasn't yet supported SCP-like URLs.
110    pub(super) fn new_for_submodule(
111        source_id: SourceId,
112        fetch_url: String,
113        gctx: &'gctx GlobalContext,
114    ) -> CargoResult<GitSource<'gctx>> {
115        let remote = GitRemote::new_from_str(fetch_url);
116        Self::new_with_remote(source_id, remote, gctx)
117    }
118
119    fn new_with_remote(
120        source_id: SourceId,
121        remote: GitRemote,
122        gctx: &'gctx GlobalContext,
123    ) -> CargoResult<GitSource<'gctx>> {
124        assert!(source_id.is_git(), "id is not git, id={}", source_id);
125
126        // Fallback to git ref from manifest if there is no locked revision.
127        let locked_rev = source_id
128            .precise_git_fragment()
129            .map(|s| Revision::new(s.into()))
130            .unwrap_or_else(|| source_id.git_reference().unwrap().clone().into());
131
132        let ident = ident_shallow(
133            &source_id,
134            gctx.cli_unstable()
135                .git
136                .map_or(false, |features| features.shallow_deps),
137        );
138
139        let source = GitSource {
140            remote,
141            locked_rev: RefCell::new(locked_rev),
142            source_id: RefCell::new(source_id),
143            path_source: RefCell::new(None),
144            short_id: RefCell::new(None),
145            ident: ident.into(),
146            gctx,
147            quiet: false,
148        };
149
150        Ok(source)
151    }
152
153    /// Gets the remote repository URL.
154    pub fn url(&self) -> Url {
155        self.source_id.borrow().url().clone()
156    }
157
158    /// Returns the packages discovered by this source. It may fetch the Git
159    /// repository as well as walk the filesystem if package information
160    /// haven't yet updated.
161    pub fn read_packages(&mut self) -> CargoResult<Vec<Package>> {
162        if self.path_source.borrow().is_none() {
163            self.invalidate_cache();
164            self.update()?;
165        }
166        self.path_source
167            .borrow_mut()
168            .as_mut()
169            .unwrap()
170            .read_packages()
171    }
172
173    fn mark_used(&self) -> CargoResult<()> {
174        self.gctx
175            .deferred_global_last_use()?
176            .mark_git_checkout_used(global_cache_tracker::GitCheckout {
177                encoded_git_name: self.ident,
178                short_name: self.short_id.borrow().expect("update before download"),
179                size: None,
180            });
181        Ok(())
182    }
183
184    /// Fetch and return a [`GitDatabase`] with the resolved revision
185    /// for this source,
186    ///
187    /// This won't fetch anything if the required revision is
188    /// already available locally.
189    pub(crate) fn fetch_db(&self, is_submodule: bool) -> CargoResult<(GitDatabase, git2::Oid)> {
190        let db_path = self.gctx.git_db_path().join(&self.ident);
191        let db_path = db_path.into_path_unlocked();
192
193        let db = self.remote.db_at(&db_path).ok();
194
195        let (db, actual_rev) = match (&*self.locked_rev.borrow(), db) {
196            // If we have a locked revision, and we have a preexisting database
197            // which has that revision, then no update needs to happen.
198            (Revision::Locked(oid), Some(db)) if db.contains(*oid) => (db, *oid),
199
200            // If we're in offline mode, we're not locked, and we have a
201            // database, then try to resolve our reference with the preexisting
202            // repository.
203            (Revision::Deferred(git_ref), Some(db)) if !self.gctx.network_allowed() => {
204                let offline_flag = self
205                    .gctx
206                    .offline_flag()
207                    .expect("always present when `!network_allowed`");
208                let rev = db.resolve(&git_ref).with_context(|| {
209                    format!(
210                        "failed to lookup reference in preexisting repository, and \
211                         can't check for updates in offline mode ({offline_flag})"
212                    )
213                })?;
214                (db, rev)
215            }
216
217            // ... otherwise we use this state to update the git database. Note
218            // that we still check for being offline here, for example in the
219            // situation that we have a locked revision but the database
220            // doesn't have it.
221            (locked_rev, db) => {
222                if let Some(offline_flag) = self.gctx.offline_flag() {
223                    anyhow::bail!(
224                        "can't checkout from '{}': you are in the offline mode ({offline_flag})",
225                        self.remote.url()
226                    );
227                }
228
229                if !self.quiet {
230                    let scope = if is_submodule {
231                        "submodule"
232                    } else {
233                        "repository"
234                    };
235                    self.gctx
236                        .shell()
237                        .status("Updating", format!("git {scope} `{}`", self.remote.url()))?;
238                }
239
240                trace!("updating git source `{:?}`", self.remote);
241
242                let locked_rev = locked_rev.clone().into();
243                let manifest_reference = self.source_id.borrow().git_reference().unwrap();
244                self.remote
245                    .checkout(&db_path, db, manifest_reference, &locked_rev, self.gctx)?
246            }
247        };
248        Ok((db, actual_rev))
249    }
250
251    fn update(&self) -> CargoResult<()> {
252        if self.path_source.borrow().is_some() {
253            self.mark_used()?;
254            return Ok(());
255        }
256
257        let git_fs = self.gctx.git_path();
258        // Ignore errors creating it, in case this is a read-only filesystem:
259        // perhaps the later operations can succeed anyhow.
260        let _ = git_fs.create_dir();
261        let git_path = self
262            .gctx
263            .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &git_fs);
264
265        // Before getting a checkout, make sure that `<cargo_home>/git` is
266        // marked as excluded from indexing and backups. Older versions of Cargo
267        // didn't do this, so we do it here regardless of whether `<cargo_home>`
268        // exists.
269        //
270        // This does not use `create_dir_all_excluded_from_backups_atomic` for
271        // the same reason: we want to exclude it even if the directory already
272        // exists.
273        exclude_from_backups_and_indexing(&git_path);
274
275        let (db, actual_rev) = self.fetch_db(false)?;
276
277        // Don’t use the full hash, in order to contribute less to reaching the
278        // path length limit on Windows. See
279        // <https://github.com/servo/servo/pull/14397>.
280        let short_id = db.to_short_id(actual_rev)?;
281
282        // Check out `actual_rev` from the database to a scoped location on the
283        // filesystem. This will use hard links and such to ideally make the
284        // checkout operation here pretty fast.
285        let checkout_path = self
286            .gctx
287            .git_checkouts_path()
288            .join(&self.ident)
289            .join(short_id.as_str());
290        let checkout_path = checkout_path.into_path_unlocked();
291        db.copy_to(actual_rev, &checkout_path, self.gctx, self.quiet)?;
292
293        let source_id = self
294            .source_id
295            .borrow()
296            .with_git_precise(Some(actual_rev.to_string()));
297        let path_source = RecursivePathSource::new(&checkout_path, source_id, self.gctx);
298
299        self.path_source.replace(Some(path_source));
300        self.short_id.replace(Some(short_id.as_str().into()));
301        self.locked_rev.replace(Revision::Locked(actual_rev));
302        self.path_source.borrow().as_ref().unwrap().load()?;
303
304        self.mark_used()?;
305        Ok(())
306    }
307}
308
309/// Indicates a [Git revision] that might be locked or deferred to be resolved.
310///
311/// [Git revision]: https://git-scm.com/docs/revisions
312#[derive(Clone, Debug)]
313enum Revision {
314    /// A [Git reference] that would trigger extra fetches when being resolved.
315    ///
316    /// [Git reference]: https://git-scm.com/book/en/v2/Git-Internals-Git-References
317    Deferred(GitReference),
318    /// A locked revision of the actual Git commit object ID.
319    Locked(git2::Oid),
320}
321
322impl Revision {
323    fn new(rev: &str) -> Revision {
324        match rev_to_oid(rev) {
325            Some(oid) => Revision::Locked(oid),
326            None => Revision::Deferred(GitReference::Rev(rev.to_string())),
327        }
328    }
329}
330
331impl From<GitReference> for Revision {
332    fn from(value: GitReference) -> Self {
333        Revision::Deferred(value)
334    }
335}
336
337impl From<Revision> for GitReference {
338    fn from(value: Revision) -> Self {
339        match value {
340            Revision::Deferred(git_ref) => git_ref,
341            Revision::Locked(oid) => GitReference::Rev(oid.to_string()),
342        }
343    }
344}
345
346/// Create an identifier from a URL,
347/// essentially turning `proto://host/path/repo` into `repo-<hash-of-url>`.
348fn ident(id: &SourceId) -> String {
349    let ident = id
350        .canonical_url()
351        .raw_canonicalized_url()
352        .path_segments()
353        .and_then(|s| s.rev().next())
354        .unwrap_or("");
355
356    let ident = if ident.is_empty() { "_empty" } else { ident };
357
358    format!("{}-{}", ident, short_hash(id.canonical_url()))
359}
360
361/// Like [`ident()`], but appends `-shallow` to it, turning
362/// `proto://host/path/repo` into `repo-<hash-of-url>-shallow`.
363///
364/// It's important to separate shallow from non-shallow clones for reasons of
365/// backwards compatibility --- older cargo's aren't necessarily handling
366/// shallow clones correctly.
367fn ident_shallow(id: &SourceId, is_shallow: bool) -> String {
368    let mut ident = ident(id);
369    if is_shallow {
370        ident.push_str("-shallow");
371    }
372    ident
373}
374
375impl<'gctx> Debug for GitSource<'gctx> {
376    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
377        write!(f, "git repo at {}", self.source_id.borrow().url())?;
378        match &*self.locked_rev.borrow() {
379            Revision::Deferred(git_ref) => match git_ref.pretty_ref(true) {
380                Some(s) => write!(f, " ({})", s),
381                None => Ok(()),
382            },
383            Revision::Locked(oid) => write!(f, " ({oid})"),
384        }
385    }
386}
387
388#[async_trait::async_trait(?Send)]
389impl<'gctx> Source for GitSource<'gctx> {
390    async fn query(
391        &self,
392        dep: &Dependency,
393        kind: QueryKind,
394        f: &mut dyn FnMut(IndexSummary),
395    ) -> CargoResult<()> {
396        if self.path_source.borrow().is_none() {
397            self.update()?;
398        }
399        let src = self.path_source.borrow();
400        let src = src.as_ref().unwrap();
401        src.query(dep, kind, f).await
402    }
403
404    fn supports_checksums(&self) -> bool {
405        false
406    }
407
408    fn requires_precise(&self) -> bool {
409        true
410    }
411
412    fn source_id(&self) -> SourceId {
413        *self.source_id.borrow()
414    }
415
416    fn download(&self, id: PackageId) -> CargoResult<MaybePackage> {
417        trace!(
418            "getting packages for package ID `{}` from `{:?}`",
419            id, self.remote
420        );
421        self.mark_used()?;
422        self.path_source
423            .borrow_mut()
424            .as_mut()
425            .expect("BUG: `update()` must be called before `get()`")
426            .download(id)
427    }
428
429    fn finish_download(&self, _id: PackageId, _data: Vec<u8>) -> CargoResult<Package> {
430        panic!("no download should have started")
431    }
432
433    fn fingerprint(&self, _pkg: &Package) -> CargoResult<String> {
434        match &*self.locked_rev.borrow() {
435            Revision::Locked(oid) => Ok(oid.to_string()),
436            _ => unreachable!("locked_rev must be resolved when computing fingerprint"),
437        }
438    }
439
440    fn describe(&self) -> String {
441        format!("Git repository {}", self.source_id.borrow())
442    }
443
444    fn add_to_yanked_whitelist(&self, _pkgs: &[PackageId]) {}
445
446    async fn is_yanked(&self, _pkg: PackageId) -> CargoResult<bool> {
447        Ok(false)
448    }
449
450    fn invalidate_cache(&self) {}
451
452    fn set_quiet(&mut self, quiet: bool) {
453        self.quiet = quiet;
454    }
455}
456
#[cfg(test)]
mod test {
    use super::ident;
    use crate::core::{GitReference, SourceId};
    use crate::util::IntoUrl;

    /// Builds a default-branch git [`SourceId`] from a URL string.
    fn src(s: &str) -> SourceId {
        let url = s.into_url().unwrap();
        SourceId::for_git(&url, GitReference::DefaultBranch).unwrap()
    }

    /// Computes the cache identifier for the git source at `url`.
    fn ident_of(url: &str) -> String {
        ident(&src(url))
    }

    #[test]
    pub fn test_url_to_path_ident_with_path() {
        let name = ident_of("https://github.com/carlhuda/cargo");
        assert!(name.starts_with("cargo-"));
    }

    #[test]
    pub fn test_url_to_path_ident_without_path() {
        let name = ident_of("https://github.com");
        assert!(name.starts_with("_empty-"));
    }

    #[test]
    fn test_canonicalize_idents_by_stripping_trailing_url_slash() {
        let with_slash = ident_of("https://github.com/PistonDevelopers/piston/");
        let without_slash = ident_of("https://github.com/PistonDevelopers/piston");
        assert_eq!(with_slash, without_slash);
    }

    #[test]
    fn test_canonicalize_idents_by_lowercasing_github_urls() {
        let mixed_case = ident_of("https://github.com/PistonDevelopers/piston");
        let lower_case = ident_of("https://github.com/pistondevelopers/piston");
        assert_eq!(mixed_case, lower_case);
    }

    #[test]
    fn test_canonicalize_idents_by_stripping_dot_git() {
        let bare = ident_of("https://github.com/PistonDevelopers/piston");
        let dot_git = ident_of("https://github.com/PistonDevelopers/piston.git");
        assert_eq!(bare, dot_git);
    }

    #[test]
    fn test_canonicalize_idents_different_protocols() {
        let over_https = ident_of("https://github.com/PistonDevelopers/piston");
        let over_git = ident_of("git://github.com/PistonDevelopers/piston");
        assert_eq!(over_https, over_git);
    }
}