From 0f0e286c60c745438bd3a709319e786dda11ab87 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Wed, 24 Jun 2026 13:06:28 +0200 Subject: [PATCH] feat(gitutils): add `parse_vcs_url` for pip VCS URL parsing Add `parse_vcs_url` to parse pip VCS URLs (git+https, git+ssh) into a repo clone URL and git ref. Use it in `bootstrapper` and `sources` to replace duplicated manual git+ URL parsing logic. Co-Authored-By: Claude Signed-off-by: Christian Heimes --- src/fromager/bootstrapper.py | 33 +++++++----------------- src/fromager/gitutils.py | 50 +++++++++++++++++++++++++++++++++++- src/fromager/sources.py | 24 +---------------- tests/test_bootstrap.py | 2 +- tests/test_gitutils.py | 32 +++++++++++++++++++++-- tests/test_sources.py | 2 +- 6 files changed, 91 insertions(+), 52 deletions(-) diff --git a/src/fromager/bootstrapper.py b/src/fromager/bootstrapper.py index c2de0c6d9..281922724 100644 --- a/src/fromager/bootstrapper.py +++ b/src/fromager/bootstrapper.py @@ -26,6 +26,7 @@ build_environment, dependencies, finders, + gitutils, hooks, progress, resolver, @@ -1067,40 +1068,24 @@ def _resolve_version_from_git_url(self, req: Requirement) -> tuple[str, Version] if not req.url: raise ValueError(f"unable to resolve from URL with no URL in {req}") - if not req.url.startswith("git+"): - raise ValueError(f"unable to handle URL scheme in {req.url} from {req}") - # We start by not knowing where we would put the source because we don't # know the version. working_src_dir: pathlib.Path | None = None version: Version | None = None - # Clean up the URL so we can parse it - reduced_url = req.url[len("git+") :] - parsed_url = urlparse(reduced_url) - - # Save the URL that we think we will use for cloning. This might change - # later if the path has a tag or branch in it. - url_to_clone = reduced_url + url_to_clone, git_ref = gitutils.parse_vcs_url(req.url, require_ref=False) need_to_clone = False - # If the URL includes an @ with text after it, we use that as the reference - # to clone, but by default we take the default branch. - git_ref: str | None = None - - if "@" not in parsed_url.path: - # If we have no reference, we know we are going to have to clone the - # repository to figure out the version to use. + if git_ref == gitutils.GIT_HEAD: + # No ref in URL, clone to discover the version. logger.debug("no reference in URL, will clone") need_to_clone = True else: - # If we have a reference, it might be a valid python version string, or - # not. It _must_ be a valid git reference. If it can be parsed as a - # valid python version, we assume the tag points to source that will - # think that is its version, so we allow reusing an existing cloned repo - # if there is one. - new_path, _, git_ref = parsed_url.path.rpartition("@") - url_to_clone = parsed_url._replace(path=new_path).geturl() + # If we have a reference, it might be a valid python version + # string, or not. It _must_ be a valid git reference. If it can + # be parsed as a valid python version, we assume the tag points + # to source that will think that is its version, so we allow + # reusing an existing cloned repo if there is one. try: version = Version(git_ref) except ValueError: diff --git a/src/fromager/gitutils.py b/src/fromager/gitutils.py index d23f630d7..2677f5024 100644 --- a/src/fromager/gitutils.py +++ b/src/fromager/gitutils.py @@ -1,5 +1,6 @@ import logging import pathlib +import typing from urllib.parse import urlparse from packaging.requirements import Requirement @@ -8,6 +9,9 @@ logger = logging.getLogger(__name__) +#: Default git ref (current HEAD of the default branch) +GIT_HEAD: typing.Final = "HEAD" + def git_clone( *, @@ -67,7 +71,7 @@ def git_clone_fast( *, output_dir: pathlib.Path, repo_url: str, - ref: str = "HEAD", + ref: str = GIT_HEAD, ) -> None: """Efficient, blobless git clone with all submodules @@ -134,3 +138,47 @@ def git_clone_fast( external_commands.run(cmd, cwd=str(output_dir), network_isolation=False) else: logger.debug("no .gitmodules file") + + +def parse_vcs_url(vcs_url: str, *, require_ref: bool = True) -> tuple[str, str]: + """Parse a `pip VCS URL`_ into a repo clone URL and git ref. + + Parses a string like ``git+https://github.com/python-wheel-build/fromager.git@refs/tags/0.88.0`` + into a git clone URL and git reference. + + When *require_ref* is ``False`` and the URL has no ``@ref``, + the ref defaults to `GIT_HEAD` instead of raising an error. + + .. _pip VCS URL: https://pip.pypa.io/en/stable/topics/vcs-support/ + + .. versionadded:: 0.89.0 + """ + parsed = urlparse(vcs_url) + + supported_schemes: set[str] = {"git+https", "git+ssh"} + if parsed.scheme not in supported_schemes: + raise ValueError( + f"unsupported VCS URL scheme {parsed.scheme!r}," + f" expected one of {sorted(supported_schemes)}" + ) + + # The ref is appended to the path after the last '@'. + path_before, sep, ref = parsed.path.rpartition("@") + if sep and not ref: + raise ValueError(f"VCS URL {vcs_url!r} has an empty ref after '@'") + if not sep: + if require_ref: + raise ValueError( + f"VCS URL {vcs_url!r} is missing a mandatory ref after '@'" + ) + path_before = parsed.path + ref = GIT_HEAD + + # Reconstruct the plain repo URL without the git+ prefix and ref + repo_url = parsed._replace( + scheme=parsed.scheme[4:], + path=path_before, + fragment="", + ).geturl() + + return repo_url, ref diff --git a/src/fromager/sources.py b/src/fromager/sources.py index bd308adbd..acd2e74b9 100644 --- a/src/fromager/sources.py +++ b/src/fromager/sources.py @@ -8,7 +8,6 @@ import tarfile import typing import zipfile -from urllib.parse import urlparse import resolvelib from packaging.requirements import Requirement @@ -77,25 +76,7 @@ def download_source( download_path = ctx.work_dir / f"{req.name}-{version}" / f"{req.name}-{version}" download_path.mkdir(parents=True, exist_ok=True) - # Parse ref from URL if it's in the form repo@ref - url_to_clone = req.url - git_ref: str | None = None - - # Remove git+ prefix for parsing - parsing_url = url_to_clone - if parsing_url.startswith("git+"): - parsing_url = parsing_url[len("git+") :] - - parsed_url = urlparse(parsing_url) - if "@" in parsed_url.path: - # Extract the ref from the path (e.g., /path/to/repo.git@5.2.0 -> @5.2.0) - new_path, _, git_ref = parsed_url.path.rpartition("@") - # Reconstruct URL without the @ref part - url_to_clone = parsed_url._replace(path=new_path).geturl() - # Add back git+ prefix if it was present - if req.url.startswith("git+"): - url_to_clone = f"git+{url_to_clone}" - + url_to_clone, git_ref = gitutils.parse_vcs_url(req.url) download_git_source( ctx=ctx, req=req, @@ -239,9 +220,6 @@ def download_git_source( destination_dir: pathlib.Path, ref: str | None = None, ) -> None: - if url_to_clone.startswith("git+"): - url_to_clone = url_to_clone[len("git+") :] - logger.info(f"cloning source from {url_to_clone}@{ref} to {destination_dir}") # Get git options from package settings pbi = ctx.package_build_info(req) diff --git a/tests/test_bootstrap.py b/tests/test_bootstrap.py index 94b98c0e8..f97526c86 100644 --- a/tests/test_bootstrap.py +++ b/tests/test_bootstrap.py @@ -884,5 +884,5 @@ def test_resolve_version_from_git_url_invalid_scheme( req = Requirement("test-pkg @ https://github.com/example/repo.git") bs = bootstrapper.Bootstrapper(tmp_context) - with pytest.raises(ValueError, match="unable to handle URL scheme"): + with pytest.raises(ValueError, match="unsupported VCS URL scheme"): bs._resolve_version_from_git_url(req) diff --git a/tests/test_gitutils.py b/tests/test_gitutils.py index 2c40f3f36..1db1bd86a 100644 --- a/tests/test_gitutils.py +++ b/tests/test_gitutils.py @@ -7,7 +7,7 @@ import pytest from packaging.version import Version -from fromager.gitutils import git_clone_fast +from fromager.gitutils import GIT_HEAD, git_clone_fast, parse_vcs_url needs_git_command = pytest.mark.skipif( shutil.which("git") is None, reason="requires 'git' command" @@ -47,7 +47,7 @@ def test_git_clone_fast(m_run: Mock, tmp_path: pathlib.Path) -> None: [ "git", "checkout", - "HEAD", + GIT_HEAD, ], network_isolation=False, cwd=str(tmp_path), @@ -76,6 +76,34 @@ def test_git_clone_fast_submodules(m_run: Mock, tmp_path: pathlib.Path) -> None: ) +def test_parse_vcs_url() -> None: + assert parse_vcs_url("git+https://git.test/org/project.git@v1.0") == ( + "https://git.test/org/project.git", + "v1.0", + ) + # '@' in netloc must not be confused with the ref '@' + assert parse_vcs_url("git+ssh://git@git.test/org/project.git@abc123") == ( + "ssh://git@git.test/org/project.git", + "abc123", + ) + # require_ref=False returns GIT_HEAD when no ref is present + assert parse_vcs_url("git+https://git.test/org/project.git", require_ref=False) == ( + "https://git.test/org/project.git", + GIT_HEAD, + ) + + +def test_parse_vcs_url_errors() -> None: + with pytest.raises(ValueError, match="missing a mandatory ref"): + parse_vcs_url("git+https://git.test/org/project.git") + with pytest.raises(ValueError, match="empty ref"): + parse_vcs_url("git+https://git.test/org/project.git@") + with pytest.raises(ValueError, match="empty ref"): + parse_vcs_url("git+https://git.test/org/project.git@", require_ref=False) + with pytest.raises(ValueError, match="unsupported VCS URL scheme"): + parse_vcs_url("git+http://git.test/org/project.git@v1.0") + + @pytest.mark.network @needs_git_command def test_git_clone_real(tmp_path: pathlib.Path) -> None: diff --git a/tests/test_sources.py b/tests/test_sources.py index 4e4a95903..239ce9ca1 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -654,7 +654,7 @@ def test_download_source_git_with_ref( mock_git.assert_called_once() call_kwargs = mock_git.call_args[1] assert call_kwargs["ref"] == "v2.0" - assert call_kwargs["url_to_clone"] == "git+https://github.com/org/pkg.git" + assert call_kwargs["url_to_clone"] == "https://github.com/org/pkg.git" assert result == tmp_context.work_dir / "pkg-2.0" / "pkg-2.0"