diff --git a/README.md b/README.md index ae8a6f9..066e545 100644 --- a/README.md +++ b/README.md @@ -230,19 +230,35 @@ Visit [Tutorial: Create a simple pipeline (S3 bucket)](https://docs.aws.amazon.c To use LFS you need to first install git-lfs. You can refer to the [official documentation](https://git-lfs.com/) on how to do this on your system. -Next, you need enable the S3 integration by running the following command in the repo folder: +Next, enable the S3 integration in the repo. There are two install modes: ```bash +# Per-remote (recommended; required when other LFS remotes coexist) +git-lfs-s3 install --remote <name> + +# Unscoped (back-compat; applies the agent to ALL remotes in the repo) git-lfs-s3 install ``` -which is a short cut for: +`--remote` writes a per-remote scoped configuration so `git-lfs-s3` only fires for that one remote — letting an S3 remote coexist with non-S3 LFS remotes (e.g. GitHub, GitLab) without breaking their LFS push/pull. Use it whenever the repo has more than one remote. + +The bare `git-lfs-s3 install` form sets `lfs.standalonetransferagent` unscoped (it applies to every remote in the repo) and is short for: ```bash git config --add lfs.customtransfer.git-lfs-s3.path git-lfs-s3 git config --add lfs.standalonetransferagent git-lfs-s3 ``` +`git-lfs-s3 install --remote <name>` instead writes: + +```bash +git config remote.<name>.lfsurl https://lfs-alias.git-remote-s3.test/<bucket>/<prefix> +git config lfs.<url>.standalonetransferagent git-lfs-s3 +git config lfs.customtransfer.git-lfs-s3.path git-lfs-s3 +``` + +The `lfs-alias.git-remote-s3.test` host is a synthetic, never-contacted match key (the `.test` TLD is reserved by RFC 6761 for non-resolvable use). It exists only because git-lfs's URL parser does not natively understand `s3://` URLs and would otherwise fall back to SSH-style endpoint discovery; setting `remote.<name>.lfsurl` short-circuits that path and gives the scoped agent lookup a stable URL to match against. + ### Creating the repo and pushing Let's assume we want to store TIFF file in LFS. 
@@ -252,13 +268,13 @@ mkdir lfs-repo cd lfs-repo git init git lfs install -git-lfs-s3 install +git remote add origin s3://my-git-bucket/lfs-repo +git-lfs-s3 install --remote origin git lfs track "*.tiff" git add .gitattributes git add file.tiff git commit -a -m "my first tiff file" -git remote add origin s3://my-git-bucket/lfs-repo git push --set-upstream origin main ``` @@ -278,7 +294,7 @@ To fix: ```bash cd lfs-repo-clone -git-lfs-s3 install +git-lfs-s3 install --remote origin git reset --hard main ``` diff --git a/git_remote_s3/common.py b/git_remote_s3/common.py index 9a402db..a2d90c1 100644 --- a/git_remote_s3/common.py +++ b/git_remote_s3/common.py @@ -34,3 +34,18 @@ def parse_git_url(url: str) -> tuple[UriScheme, str, str, str]: uri_scheme = UriScheme.S3_ZIP return uri_scheme, profile, bucket, prefix + + +LFS_ALIAS_HOST = "lfs-alias.git-remote-s3.test" + + +def synthetic_lfs_url(bucket: str, prefix: str) -> str: + """Builds the synthetic LFS endpoint URL for a given bucket and prefix. + + The URL is never contacted; it is purely a stable match key so that + ``lfs.<url>.standalonetransferagent`` can be scoped per remote, and so + git-lfs's HTTPS-shaped endpoint resolution short-circuits its SSH-style + discovery for ``s3://`` URLs. The hostname uses the RFC 6761-reserved + ``.test`` TLD to guarantee non-collision with any real host. 
+ """ + return f"https://{LFS_ALIAS_HOST}/{bucket}/{prefix}" diff --git a/git_remote_s3/lfs.py b/git_remote_s3/lfs.py index 1c3c990..b465ab3 100644 --- a/git_remote_s3/lfs.py +++ b/git_remote_s3/lfs.py @@ -9,7 +9,8 @@ import boto3 import threading import os -from .common import parse_git_url +from typing import Optional +from .common import parse_git_url, synthetic_lfs_url from .git import validate_ref_name if "lfs" in __name__: @@ -131,32 +132,156 @@ def download(self, event: dict): sys.stdout.flush() -def install(): - result = subprocess.run( - ["git", "config", "--add", "lfs.customtransfer.git-lfs-s3.path", "git-lfs-s3"], +def _git_config_get(key: str) -> Optional[str]: + """Returns the current value of a git config key, or None if unset.""" + res = subprocess.run( + ["git", "config", "--get", key], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - if result.returncode != 0: - sys.stderr.write(result.stderr.decode("utf-8").strip()) + if res.returncode != 0: + return None + return res.stdout.decode("utf-8").strip() + + +def _git_config_set(key: str, value: str) -> None: + """Sets a git config key to value, replacing any existing values.""" + res = subprocess.run( + ["git", "config", "--replace-all", key, value], + stderr=subprocess.PIPE, + ) + if res.returncode != 0: + sys.stderr.write(res.stderr.decode("utf-8").strip() + "\n") sys.stderr.flush() sys.exit(1) - result = subprocess.run( - ["git", "config", "--add", "lfs.standalonetransferagent", "git-lfs-s3"], + + +def _list_git_remotes() -> list: + """Returns the list of configured git remote names (empty on error).""" + res = subprocess.run( + ["git", "remote"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + if res.returncode != 0: + return [] + return [r for r in res.stdout.decode("utf-8").splitlines() if r.strip()] + + +def _resolve_s3_remote(remote_name: str) -> tuple: + """Validates that remote_name exists and points at an S3 URL. + + Returns (bucket, prefix). 
Exits 1 with a clear error message otherwise. + """ + res = subprocess.run( + ["git", "remote", "get-url", remote_name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - if result.returncode != 0: - sys.stderr.write(result.stderr.decode("utf-8").strip()) + if res.returncode != 0: + sys.stderr.write( + f"error: remote '{remote_name}' is not configured. " + f"Add it first with: " + f"git remote add {remote_name} s3://<bucket>/<prefix>\n" + ) sys.stderr.flush() sys.exit(1) + url = res.stdout.decode("utf-8").strip() + _, _, bucket, prefix = parse_git_url(url) + if bucket is None or prefix is None: + sys.stderr.write( + f"error: remote '{remote_name}' has URL '{url}', which is not " + f"an s3:// or s3+zip:// URL. --remote can only scope LFS " + f"configuration for S3 remotes.\n" + ) + sys.stderr.flush() + sys.exit(1) + return bucket, prefix + +def install(*, remote_name: Optional[str] = None) -> None: + """Installs git-lfs-s3 as a custom transfer agent. + + With remote_name=None, writes unscoped configuration that applies to + every remote in the repo (back-compat). With remote_name set, writes + per-remote scoped configuration so the agent only fires for that one + remote — required for coexistence with non-S3 LFS remotes. + """ + if remote_name is None: + _install_unscoped() + else: + _install_scoped(remote_name) + + +def _install_unscoped() -> None: + remotes = _list_git_remotes() + if len(remotes) > 1: + sys.stderr.write( + f"warning: multiple remotes configured ({', '.join(remotes)}); " + "'git-lfs-s3 install' writes unscoped configuration that " + "applies to ALL remotes. If any non-S3 remote uses LFS, " + "push/pull may fail. 
Use 'git-lfs-s3 install --remote <name>' " + "to scope to a single S3 remote.\n" + ) + sys.stderr.flush() + _git_config_set("lfs.customtransfer.git-lfs-s3.path", "git-lfs-s3") + _git_config_set("lfs.standalonetransferagent", "git-lfs-s3") sys.stdout.write("git-lfs-s3 installed\n") sys.stdout.flush() +def _install_scoped(remote_name: str) -> None: + bucket, prefix = _resolve_s3_remote(remote_name) + lfs_url = synthetic_lfs_url(bucket, prefix) + + existing_lfsurl = _git_config_get(f"remote.{remote_name}.lfsurl") + if existing_lfsurl is not None and existing_lfsurl != lfs_url: + sys.stderr.write( + f"error: remote.{remote_name}.lfsurl is already set to " + f"'{existing_lfsurl}'. git-lfs-s3 will not overwrite an " + f"existing LFS URL. If this was set in error, unset it with:\n" + f" git config --unset remote.{remote_name}.lfsurl\n" + ) + sys.stderr.flush() + sys.exit(1) + + if _git_config_get("lfs.standalonetransferagent") is not None: + sys.stderr.write( + "warning: lfs.standalonetransferagent is set unscoped; this " + "applies git-lfs-s3 to ALL remotes and will defeat per-remote " + "scoping. 
Unset it with:\n" + " git config --unset lfs.standalonetransferagent\n" + ) + sys.stderr.flush() + + _git_config_set("lfs.customtransfer.git-lfs-s3.path", "git-lfs-s3") + _git_config_set(f"remote.{remote_name}.lfsurl", lfs_url) + _git_config_set(f"lfs.{lfs_url}.standalonetransferagent", "git-lfs-s3") + sys.stdout.write( + f"git-lfs-s3 installed for remote '{remote_name}' " f"(LFS alias: {lfs_url})\n" + ) + sys.stdout.flush() + + def main(): # noqa: C901 if len(sys.argv) > 1: if "install" == sys.argv[1]: - install() + remote_name: Optional[str] = None + args = sys.argv[2:] + i = 0 + while i < len(args): + if args[i] == "--remote": + if i + 1 >= len(args): + sys.stderr.write("error: --remote requires a value\n") + sys.stderr.flush() + sys.exit(2) + remote_name = args[i + 1] + i += 2 + else: + sys.stderr.write(f"error: unknown argument to install: {args[i]}\n") + sys.stderr.flush() + sys.exit(2) + install(remote_name=remote_name) sys.exit(0) elif "debug" == sys.argv[1]: logger.setLevel(logging.DEBUG) diff --git a/test/lfs_install_test.py b/test/lfs_install_test.py new file mode 100644 index 0000000..fd4d7fd --- /dev/null +++ b/test/lfs_install_test.py @@ -0,0 +1,183 @@ +# SPDX-FileCopyrightText: 2023-present Amazon.com, Inc. 
or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import pytest + +from git_remote_s3 import lfs +from git_remote_s3.common import synthetic_lfs_url, LFS_ALIAS_HOST + + +def _git(args, cwd): + res = subprocess.run(args, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + assert res.returncode == 0, res.stderr.decode() + return res.stdout.decode().strip() + + +def _git_config_get_all(key, cwd): + res = subprocess.run( + ["git", "config", "--get-all", key], + cwd=cwd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + if res.returncode != 0: + return [] + return [line for line in res.stdout.decode().splitlines() if line.strip()] + + +@pytest.fixture +def repo(tmp_path, monkeypatch): + _git(["git", "init", "-q", "-b", "main", str(tmp_path)], cwd=tmp_path) + _git(["git", "config", "user.email", "test@example.com"], cwd=tmp_path) + _git(["git", "config", "user.name", "Test"], cwd=tmp_path) + monkeypatch.chdir(tmp_path) + return tmp_path + + +def test_synthetic_lfs_url_is_deterministic(): + assert ( + synthetic_lfs_url("my-bucket", "path/to/repo") + == f"https://{LFS_ALIAS_HOST}/my-bucket/path/to/repo" + ) + + +def test_synthetic_lfs_url_uses_reserved_tld(): + assert LFS_ALIAS_HOST.endswith(".test") + + +def test_install_bare_one_remote_writes_unscoped_config(repo, capsys): + _git(["git", "remote", "add", "origin", "s3://bucket/repo"], cwd=repo) + + lfs.install() + + assert _git_config_get_all("lfs.customtransfer.git-lfs-s3.path", repo) == [ + "git-lfs-s3" + ] + assert _git_config_get_all("lfs.standalonetransferagent", repo) == ["git-lfs-s3"] + captured = capsys.readouterr() + assert "git-lfs-s3 installed" in captured.out + assert "warning" not in captured.err.lower() + + +def test_install_bare_multiple_remotes_warns(repo, capsys): + _git(["git", "remote", "add", "origin", "ssh://git@example.com/repo.git"], cwd=repo) + _git(["git", "remote", "add", "s3", "s3://bucket/repo"], cwd=repo) + + lfs.install() + + assert 
_git_config_get_all("lfs.standalonetransferagent", repo) == ["git-lfs-s3"] + captured = capsys.readouterr() + assert "warning" in captured.err.lower() + assert "--remote" in captured.err + + +def test_install_remote_nonexistent_exits(repo, capsys): + with pytest.raises(SystemExit) as exc: + lfs.install(remote_name="missing") + assert exc.value.code == 1 + captured = capsys.readouterr() + assert "not configured" in captured.err + assert "missing" in captured.err + + +def test_install_remote_non_s3_exits(repo, capsys): + _git( + ["git", "remote", "add", "github", "ssh://git@github.com/example/repo.git"], + cwd=repo, + ) + + with pytest.raises(SystemExit) as exc: + lfs.install(remote_name="github") + assert exc.value.code == 1 + captured = capsys.readouterr() + assert "not an s3" in captured.err.lower() + + +def test_install_remote_s3_writes_scoped_config(repo, capsys): + _git(["git", "remote", "add", "s3", "s3://bucket/repo"], cwd=repo) + expected_url = synthetic_lfs_url("bucket", "repo") + + lfs.install(remote_name="s3") + + assert _git_config_get_all("remote.s3.lfsurl", repo) == [expected_url] + assert _git_config_get_all(f"lfs.{expected_url}.standalonetransferagent", repo) == [ + "git-lfs-s3" + ] + assert _git_config_get_all("lfs.customtransfer.git-lfs-s3.path", repo) == [ + "git-lfs-s3" + ] + assert _git_config_get_all("lfs.standalonetransferagent", repo) == [] + captured = capsys.readouterr() + assert "installed for remote 's3'" in captured.out + assert expected_url in captured.out + + +def test_install_remote_s3_zip_writes_scoped_config(repo): + _git(["git", "remote", "add", "s3", "s3+zip://bucket/repo"], cwd=repo) + expected_url = synthetic_lfs_url("bucket", "repo") + + lfs.install(remote_name="s3") + + assert _git_config_get_all("remote.s3.lfsurl", repo) == [expected_url] + assert _git_config_get_all(f"lfs.{expected_url}.standalonetransferagent", repo) == [ + "git-lfs-s3" + ] + + +def test_install_remote_is_idempotent(repo): + _git(["git", "remote", "add", 
"s3", "s3://bucket/repo"], cwd=repo) + + lfs.install(remote_name="s3") + lfs.install(remote_name="s3") + + expected_url = synthetic_lfs_url("bucket", "repo") + assert _git_config_get_all("remote.s3.lfsurl", repo) == [expected_url] + assert _git_config_get_all(f"lfs.{expected_url}.standalonetransferagent", repo) == [ + "git-lfs-s3" + ] + assert _git_config_get_all("lfs.customtransfer.git-lfs-s3.path", repo) == [ + "git-lfs-s3" + ] + + +def test_install_remote_refuses_to_overwrite_existing_lfsurl(repo, capsys): + _git(["git", "remote", "add", "s3", "s3://bucket/repo"], cwd=repo) + _git( + [ + "git", + "config", + "remote.s3.lfsurl", + "https://real-lfs.example.com/foo", + ], + cwd=repo, + ) + + with pytest.raises(SystemExit) as exc: + lfs.install(remote_name="s3") + assert exc.value.code == 1 + captured = capsys.readouterr() + assert "already set" in captured.err + assert "https://real-lfs.example.com/foo" in captured.err + assert _git_config_get_all("remote.s3.lfsurl", repo) == [ + "https://real-lfs.example.com/foo" + ] + + +def test_install_remote_warns_on_existing_unscoped_agent(repo, capsys): + _git(["git", "remote", "add", "s3", "s3://bucket/repo"], cwd=repo) + _git( + ["git", "config", "lfs.standalonetransferagent", "git-lfs-s3"], + cwd=repo, + ) + + lfs.install(remote_name="s3") + + captured = capsys.readouterr() + assert "warning" in captured.err.lower() + assert "lfs.standalonetransferagent" in captured.err + expected_url = synthetic_lfs_url("bucket", "repo") + assert _git_config_get_all(f"lfs.{expected_url}.standalonetransferagent", repo) == [ + "git-lfs-s3" + ]