cargo/sources/git/
source.rs

1//! See [`GitSource`].
2
3use crate::core::global_cache_tracker;
4use crate::core::GitReference;
5use crate::core::SourceId;
6use crate::core::{Dependency, Package, PackageId};
7use crate::sources::git::utils::rev_to_oid;
8use crate::sources::git::utils::GitRemote;
9use crate::sources::source::MaybePackage;
10use crate::sources::source::QueryKind;
11use crate::sources::source::Source;
12use crate::sources::IndexSummary;
13use crate::sources::RecursivePathSource;
14use crate::util::cache_lock::CacheLockMode;
15use crate::util::errors::CargoResult;
16use crate::util::hex::short_hash;
17use crate::util::interning::InternedString;
18use crate::util::GlobalContext;
19use anyhow::Context as _;
20use cargo_util::paths::exclude_from_backups_and_indexing;
21use std::fmt::{self, Debug, Formatter};
22use std::task::Poll;
23use tracing::trace;
24use url::Url;
25
/// `GitSource` contains one or more packages gathered from a Git repository.
/// Under the hood it uses [`RecursivePathSource`] to discover packages inside the
/// repository.
///
/// ## Filesystem layout
///
/// During a successful `GitSource` download, at least two Git repositories are
/// created: one is the shared Git database of this remote, and the other is the
/// Git checkout to a specific revision, which contains the actual files to be
/// compiled. Multiple checkouts can be cloned from a single Git database.
///
/// Those repositories are located at Cargo's Git cache directory
/// `$CARGO_HOME/git`. The file tree of the cache directory roughly looks like:
///
/// ```text
/// $CARGO_HOME/git/
/// ├── checkouts/
/// │  ├── gimli-a0d193bd15a5ed96/
/// │  │  ├── 8e73ef0/     # Git short ID for a certain revision
/// │  │  ├── a2a4b78/
/// │  │  └── e33d1ac/
/// │  └── log-c58e1db3de7c154d-shallow/
/// │     └── 11eda98/
/// └── db/
///    ├── gimli-a0d193bd15a5ed96/
///    └── log-c58e1db3de7c154d-shallow/
/// ```
///
/// For more on Git cache directory, see ["Cargo Home"] in The Cargo Book.
///
/// For more on the directory format `<pkg>-<hash>[-shallow]`, see [`ident`]
/// and [`ident_shallow`].
///
/// ## Locked to a revision
///
/// Once a `GitSource` is fetched, it will resolve to a specific commit revision.
/// This is often mentioned as "locked revision" (`locked_rev`) throughout the
/// codebase. The revision is written into `Cargo.lock`. This is essential since
/// we want to ensure a package can compile with the same set of files when
/// a `Cargo.lock` is present. With the `locked_rev` provided, `GitSource` can
/// precisely fetch the same revision from the Git repository.
///
/// ["Cargo Home"]: https://doc.rust-lang.org/nightly/cargo/guide/cargo-home.html#directories
pub struct GitSource<'gctx> {
    /// The git remote which we're going to fetch from.
    remote: GitRemote,
    /// The revision which a git source is locked to.
    ///
    /// Expected to always be [`Revision::Locked`] after the Git repository is fetched.
    locked_rev: Revision,
    /// The unique identifier of this source.
    source_id: SourceId,
    /// The underlying path source to discover packages inside the Git repository.
    ///
    /// This gets set to `Some` after the git repo has been checked out
    /// (automatically handled via [`GitSource::block_until_ready`]).
    path_source: Option<RecursivePathSource<'gctx>>,
    /// A short string that uniquely identifies the version of the checkout.
    ///
    /// This is typically a 7-character string of the OID hash, automatically
    /// increasing in size if it is ambiguous.
    ///
    /// This is set to `Some` after the git repo has been checked out
    /// (automatically handled via [`GitSource::block_until_ready`]).
    short_id: Option<InternedString>,
    /// The identifier of this source for Cargo's Git cache directory.
    /// See [`ident`] for more.
    ident: InternedString,
    /// The global context, consulted for unstable flags, offline mode,
    /// the Git cache paths, and shell status output.
    gctx: &'gctx GlobalContext,
    /// Disables status messages.
    quiet: bool,
}
98
99impl<'gctx> GitSource<'gctx> {
100    /// Creates a git source for the given [`SourceId`].
101    pub fn new(source_id: SourceId, gctx: &'gctx GlobalContext) -> CargoResult<GitSource<'gctx>> {
102        assert!(source_id.is_git(), "id is not git, id={}", source_id);
103
104        let remote = GitRemote::new(source_id.url());
105        // Fallback to git ref from manifest if there is no locked revision.
106        let locked_rev = source_id
107            .precise_git_fragment()
108            .map(|s| Revision::new(s.into()))
109            .unwrap_or_else(|| source_id.git_reference().unwrap().clone().into());
110
111        let ident = ident_shallow(
112            &source_id,
113            gctx.cli_unstable()
114                .git
115                .map_or(false, |features| features.shallow_deps),
116        );
117
118        let source = GitSource {
119            remote,
120            locked_rev,
121            source_id,
122            path_source: None,
123            short_id: None,
124            ident: ident.into(),
125            gctx,
126            quiet: false,
127        };
128
129        Ok(source)
130    }
131
132    /// Gets the remote repository URL.
133    pub fn url(&self) -> &Url {
134        self.remote.url()
135    }
136
137    /// Returns the packages discovered by this source. It may fetch the Git
138    /// repository as well as walk the filesystem if package information
139    /// haven't yet updated.
140    pub fn read_packages(&mut self) -> CargoResult<Vec<Package>> {
141        if self.path_source.is_none() {
142            self.invalidate_cache();
143            self.block_until_ready()?;
144        }
145        self.path_source.as_mut().unwrap().read_packages()
146    }
147
148    fn mark_used(&self) -> CargoResult<()> {
149        self.gctx
150            .deferred_global_last_use()?
151            .mark_git_checkout_used(global_cache_tracker::GitCheckout {
152                encoded_git_name: self.ident,
153                short_name: self.short_id.expect("update before download"),
154                size: None,
155            });
156        Ok(())
157    }
158}
159
/// Indicates a [Git revision] that might be locked or deferred to be resolved.
///
/// A revision starts out as either variant (see `Revision::new` and the
/// `From<GitReference>` impl) and is replaced by [`Revision::Locked`] once
/// [`GitSource::block_until_ready`] has determined the actual commit.
///
/// [Git revision]: https://git-scm.com/docs/revisions
#[derive(Clone, Debug)]
enum Revision {
    /// A [Git reference] that would trigger extra fetches when being resolved.
    ///
    /// [Git reference]: https://git-scm.com/book/en/v2/Git-Internals-Git-References
    Deferred(GitReference),
    /// A locked revision of the actual Git commit object ID.
    Locked(git2::Oid),
}
172
173impl Revision {
174    fn new(rev: &str) -> Revision {
175        match rev_to_oid(rev) {
176            Some(oid) => Revision::Locked(oid),
177            None => Revision::Deferred(GitReference::Rev(rev.to_string())),
178        }
179    }
180}
181
182impl From<GitReference> for Revision {
183    fn from(value: GitReference) -> Self {
184        Revision::Deferred(value)
185    }
186}
187
188impl From<Revision> for GitReference {
189    fn from(value: Revision) -> Self {
190        match value {
191            Revision::Deferred(git_ref) => git_ref,
192            Revision::Locked(oid) => GitReference::Rev(oid.to_string()),
193        }
194    }
195}
196
197/// Create an identifier from a URL,
198/// essentially turning `proto://host/path/repo` into `repo-<hash-of-url>`.
199fn ident(id: &SourceId) -> String {
200    let ident = id
201        .canonical_url()
202        .raw_canonicalized_url()
203        .path_segments()
204        .and_then(|s| s.rev().next())
205        .unwrap_or("");
206
207    let ident = if ident.is_empty() { "_empty" } else { ident };
208
209    format!("{}-{}", ident, short_hash(id.canonical_url()))
210}
211
212/// Like [`ident()`], but appends `-shallow` to it, turning
213/// `proto://host/path/repo` into `repo-<hash-of-url>-shallow`.
214///
215/// It's important to separate shallow from non-shallow clones for reasons of
216/// backwards compatibility --- older cargo's aren't necessarily handling
217/// shallow clones correctly.
218fn ident_shallow(id: &SourceId, is_shallow: bool) -> String {
219    let mut ident = ident(id);
220    if is_shallow {
221        ident.push_str("-shallow");
222    }
223    ident
224}
225
226impl<'gctx> Debug for GitSource<'gctx> {
227    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
228        write!(f, "git repo at {}", self.remote.url())?;
229        match &self.locked_rev {
230            Revision::Deferred(git_ref) => match git_ref.pretty_ref(true) {
231                Some(s) => write!(f, " ({})", s),
232                None => Ok(()),
233            },
234            Revision::Locked(oid) => write!(f, " ({oid})"),
235        }
236    }
237}
238
239impl<'gctx> Source for GitSource<'gctx> {
240    fn query(
241        &mut self,
242        dep: &Dependency,
243        kind: QueryKind,
244        f: &mut dyn FnMut(IndexSummary),
245    ) -> Poll<CargoResult<()>> {
246        if let Some(src) = self.path_source.as_mut() {
247            src.query(dep, kind, f)
248        } else {
249            Poll::Pending
250        }
251    }
252
253    fn supports_checksums(&self) -> bool {
254        false
255    }
256
257    fn requires_precise(&self) -> bool {
258        true
259    }
260
261    fn source_id(&self) -> SourceId {
262        self.source_id
263    }
264
265    fn block_until_ready(&mut self) -> CargoResult<()> {
266        if self.path_source.is_some() {
267            self.mark_used()?;
268            return Ok(());
269        }
270
271        let git_fs = self.gctx.git_path();
272        // Ignore errors creating it, in case this is a read-only filesystem:
273        // perhaps the later operations can succeed anyhow.
274        let _ = git_fs.create_dir();
275        let git_path = self
276            .gctx
277            .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &git_fs);
278
279        // Before getting a checkout, make sure that `<cargo_home>/git` is
280        // marked as excluded from indexing and backups. Older versions of Cargo
281        // didn't do this, so we do it here regardless of whether `<cargo_home>`
282        // exists.
283        //
284        // This does not use `create_dir_all_excluded_from_backups_atomic` for
285        // the same reason: we want to exclude it even if the directory already
286        // exists.
287        exclude_from_backups_and_indexing(&git_path);
288
289        let db_path = self.gctx.git_db_path().join(&self.ident);
290        let db_path = db_path.into_path_unlocked();
291
292        let db = self.remote.db_at(&db_path).ok();
293
294        let (db, actual_rev) = match (&self.locked_rev, db) {
295            // If we have a locked revision, and we have a preexisting database
296            // which has that revision, then no update needs to happen.
297            (Revision::Locked(oid), Some(db)) if db.contains(*oid) => (db, *oid),
298
299            // If we're in offline mode, we're not locked, and we have a
300            // database, then try to resolve our reference with the preexisting
301            // repository.
302            (Revision::Deferred(git_ref), Some(db)) if self.gctx.offline() => {
303                let rev = db.resolve(&git_ref).with_context(|| {
304                    "failed to lookup reference in preexisting repository, and \
305                         can't check for updates in offline mode (--offline)"
306                })?;
307                (db, rev)
308            }
309
310            // ... otherwise we use this state to update the git database. Note
311            // that we still check for being offline here, for example in the
312            // situation that we have a locked revision but the database
313            // doesn't have it.
314            (locked_rev, db) => {
315                if self.gctx.offline() {
316                    anyhow::bail!(
317                        "can't checkout from '{}': you are in the offline mode (--offline)",
318                        self.remote.url()
319                    );
320                }
321
322                if !self.quiet {
323                    self.gctx.shell().status(
324                        "Updating",
325                        format!("git repository `{}`", self.remote.url()),
326                    )?;
327                }
328
329                trace!("updating git source `{:?}`", self.remote);
330
331                let locked_rev = locked_rev.clone().into();
332                self.remote.checkout(&db_path, db, &locked_rev, self.gctx)?
333            }
334        };
335
336        // Don’t use the full hash, in order to contribute less to reaching the
337        // path length limit on Windows. See
338        // <https://github.com/servo/servo/pull/14397>.
339        let short_id = db.to_short_id(actual_rev)?;
340
341        // Check out `actual_rev` from the database to a scoped location on the
342        // filesystem. This will use hard links and such to ideally make the
343        // checkout operation here pretty fast.
344        let checkout_path = self
345            .gctx
346            .git_checkouts_path()
347            .join(&self.ident)
348            .join(short_id.as_str());
349        let checkout_path = checkout_path.into_path_unlocked();
350        db.copy_to(actual_rev, &checkout_path, self.gctx)?;
351
352        let source_id = self
353            .source_id
354            .with_git_precise(Some(actual_rev.to_string()));
355        let path_source = RecursivePathSource::new(&checkout_path, source_id, self.gctx);
356
357        self.path_source = Some(path_source);
358        self.short_id = Some(short_id.as_str().into());
359        self.locked_rev = Revision::Locked(actual_rev);
360        self.path_source.as_mut().unwrap().load()?;
361
362        self.mark_used()?;
363        Ok(())
364    }
365
366    fn download(&mut self, id: PackageId) -> CargoResult<MaybePackage> {
367        trace!(
368            "getting packages for package ID `{}` from `{:?}`",
369            id,
370            self.remote
371        );
372        self.mark_used()?;
373        self.path_source
374            .as_mut()
375            .expect("BUG: `update()` must be called before `get()`")
376            .download(id)
377    }
378
379    fn finish_download(&mut self, _id: PackageId, _data: Vec<u8>) -> CargoResult<Package> {
380        panic!("no download should have started")
381    }
382
383    fn fingerprint(&self, _pkg: &Package) -> CargoResult<String> {
384        match &self.locked_rev {
385            Revision::Locked(oid) => Ok(oid.to_string()),
386            _ => unreachable!("locked_rev must be resolved when computing fingerprint"),
387        }
388    }
389
390    fn describe(&self) -> String {
391        format!("Git repository {}", self.source_id)
392    }
393
394    fn add_to_yanked_whitelist(&mut self, _pkgs: &[PackageId]) {}
395
396    fn is_yanked(&mut self, _pkg: PackageId) -> Poll<CargoResult<bool>> {
397        Poll::Ready(Ok(false))
398    }
399
400    fn invalidate_cache(&mut self) {}
401
402    fn set_quiet(&mut self, quiet: bool) {
403        self.quiet = quiet;
404    }
405}
406
#[cfg(test)]
mod test {
    use super::{ident, ident_shallow};
    use crate::core::{GitReference, SourceId};
    use crate::util::IntoUrl;

    /// Builds a default-branch Git `SourceId` for `url`.
    fn src(url: &str) -> SourceId {
        SourceId::for_git(&url.into_url().unwrap(), GitReference::DefaultBranch).unwrap()
    }

    #[test]
    fn test_url_to_path_ident_with_path() {
        let name = ident(&src("https://github.com/carlhuda/cargo"));
        assert!(name.starts_with("cargo-"));
    }

    #[test]
    fn test_url_to_path_ident_without_path() {
        let name = ident(&src("https://github.com"));
        assert!(name.starts_with("_empty-"));
    }

    #[test]
    fn test_canonicalize_idents_by_stripping_trailing_url_slash() {
        let ident1 = ident(&src("https://github.com/PistonDevelopers/piston/"));
        let ident2 = ident(&src("https://github.com/PistonDevelopers/piston"));
        assert_eq!(ident1, ident2);
    }

    #[test]
    fn test_canonicalize_idents_by_lowercasing_github_urls() {
        let ident1 = ident(&src("https://github.com/PistonDevelopers/piston"));
        let ident2 = ident(&src("https://github.com/pistondevelopers/piston"));
        assert_eq!(ident1, ident2);
    }

    #[test]
    fn test_canonicalize_idents_by_stripping_dot_git() {
        let ident1 = ident(&src("https://github.com/PistonDevelopers/piston"));
        let ident2 = ident(&src("https://github.com/PistonDevelopers/piston.git"));
        assert_eq!(ident1, ident2);
    }

    #[test]
    fn test_canonicalize_idents_different_protocols() {
        let ident1 = ident(&src("https://github.com/PistonDevelopers/piston"));
        let ident2 = ident(&src("git://github.com/PistonDevelopers/piston"));
        assert_eq!(ident1, ident2);
    }

    // The shallow variant must only differ from `ident` by its suffix, so
    // that shallow and full clones never share a cache directory.
    #[test]
    fn test_ident_shallow_appends_suffix_only_when_shallow() {
        let id = src("https://github.com/carlhuda/cargo");
        let base = ident(&id);
        assert_eq!(ident_shallow(&id, false), base);
        assert_eq!(ident_shallow(&id, true), format!("{base}-shallow"));
    }
}