diff --git a/dvc/fs/utils.py b/dvc/fs/utils.py index cee9ae16fe..9da6e14946 100644 --- a/dvc/fs/utils.py +++ b/dvc/fs/utils.py @@ -14,27 +14,48 @@ logger = logging.getLogger(__name__) +def _link( + from_fs: "BaseFileSystem", + from_info: "AnyPath", + to_fs: "BaseFileSystem", + to_info: "DvcPath", + hardlink: bool = False, +) -> None: + if not isinstance(from_fs, type(to_fs)): + raise OSError(errno.EXDEV, "can't link across filesystems") + + links = ["reflink"] + ["hardlink"] if hardlink else [] + while links: + link = links.pop(0) + try: + func = getattr(to_fs, link) + except AttributeError: + continue + + try: + return func(from_info, to_info) + except RemoteActionNotImplemented: + continue + except OSError as exc: + if exc.errno not in [errno.EXDEV, errno.ENOTSUP]: + raise + + raise OSError(errno.ENOTSUP, "reflink and hardlink are not supported") + + def transfer( from_fs: "BaseFileSystem", from_info: "AnyPath", to_fs: "BaseFileSystem", to_info: "DvcPath", - move: bool = False, + hardlink: bool = False, ) -> None: - same_fs = isinstance(from_fs, type(to_fs)) - use_move = same_fs and move try: - if use_move: - return to_fs.move(from_info, to_info) - - if same_fs: - try: - return from_fs.reflink(from_info, to_info) - except RemoteActionNotImplemented: - pass - except OSError as exc: - if exc.errno not in [errno.EXDEV, errno.ENOTSUP]: - raise + try: + return _link(from_fs, from_info, to_fs, to_info, hardlink=hardlink) + except OSError as exc: + if exc.errno not in [errno.EXDEV, errno.ENOTSUP]: + raise if isinstance(from_fs, LocalFileSystem): if not isinstance(from_info, from_fs.PATH_CLS): @@ -61,8 +82,6 @@ def transfer( and isinstance(exc.__context__, FileExistsError) ): logger.debug("'%s' file already exists, skipping", to_info) - if use_move: - from_fs.remove(from_info) return None raise diff --git a/dvc/objects/db/base.py b/dvc/objects/db/base.py index f0835b6c76..927c8b41b0 100644 --- a/dvc/objects/db/base.py +++ b/dvc/objects/db/base.py @@ -81,13 +81,17 @@ def _add_file( from_info: "AnyPath", to_info: "DvcPath", _hash_info: "HashInfo", - move: bool = False, + hardlink: bool = False, ): from dvc import fs self.makedirs(to_info.parent) return fs.utils.transfer( - from_fs, from_info, self.fs, to_info, move=move + from_fs, + from_info, + self.fs, + to_info, + hardlink=hardlink, ) def add( @@ -95,7 +99,7 @@ def add( path_info: "AnyPath", fs: "BaseFileSystem", hash_info: "HashInfo", - move: bool = True, + hardlink: bool = False, verify: Optional[bool] = None, ): if self.read_only: @@ -110,7 +114,7 @@ def add( pass cache_info = self.hash_to_path_info(hash_info.value) - self._add_file(fs, path_info, cache_info, hash_info, move=move) + self._add_file(fs, path_info, cache_info, hash_info, hardlink=hardlink) try: if verify: diff --git a/dvc/objects/db/reference.py b/dvc/objects/db/reference.py index 0332a924c3..d0affe89f1 100644 --- a/dvc/objects/db/reference.py +++ b/dvc/objects/db/reference.py @@ -58,12 +58,16 @@ def _add_file( from_info: "AnyPath", to_info: "DvcPath", hash_info: "HashInfo", - move: bool = False, + hardlink: bool = False, ): self.makedirs(to_info.parent) if hash_info.isdir: return super()._add_file( - from_fs, from_info, to_info, hash_info, move + from_fs, + from_info, + to_info, + hash_info, + hardlink=hardlink, ) ref_file = ReferenceHashFile(from_info, from_fs, hash_info) self._obj_cache[hash_info] = ref_file diff --git a/dvc/objects/stage.py b/dvc/objects/stage.py index 2458c0530d..c82bb5656b 100644 --- a/dvc/objects/stage.py +++ b/dvc/objects/stage.py @@ -88,7 +88,7 @@ def _stage_file(path_info, fs, name, odb=None, upload_odb=None, dry_run=False): if dry_run: obj = HashFile(path_info, fs, hash_info) else: - odb.add(path_info, fs, hash_info, move=False) + odb.add(path_info, fs, hash_info, hardlink=False) obj = odb.get(hash_info) return path_info, meta, obj @@ -175,7 +175,7 @@ def _stage_tree(path_info, fs, fs_info, name, odb=None, **kwargs): path_info, fs ) tree.digest(hash_info=hash_info) - odb.add(tree.path_info, tree.fs, tree.hash_info, move=True) + odb.add(tree.path_info, tree.fs, tree.hash_info, hardlink=False) raw = odb.get(tree.hash_info) # cleanup unneeded memfs tmpfile and return tree based on the # ODB fs/path diff --git a/dvc/objects/transfer.py b/dvc/objects/transfer.py index 00078d1aef..965214a11b 100644 --- a/dvc/objects/transfer.py +++ b/dvc/objects/transfer.py @@ -133,7 +133,7 @@ def transfer( obj_ids: Iterable["HashInfo"], jobs: Optional[int] = None, verify: bool = False, - move: bool = False, + hardlink: bool = False, **kwargs, ) -> int: """Transfer (copy) the specified objects from one ODB to another. @@ -157,7 +157,11 @@ def transfer( def func(hash_info: "HashInfo") -> None: obj = src.get(hash_info) return dest.add( - obj.path_info, obj.fs, obj.hash_info, verify=verify, move=move + obj.path_info, + obj.fs, + obj.hash_info, + verify=verify, + hardlink=hardlink, ) total = len(status.new) diff --git a/dvc/output.py b/dvc/output.py index 814b51c6b5..d12ea209d6 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -600,7 +600,7 @@ def commit(self, filter_info=None): self.odb, {obj.hash_info}, shallow=False, - move=True, + hardlink=True, ) checkout( filter_info or self.path_info, @@ -629,7 +629,7 @@ def _commit_granular_dir(self, filter_info): self.odb, {save_obj.hash_info} | {oid for _, _, oid in save_obj}, shallow=True, - move=True, + hardlink=True, ) return checkout_obj @@ -826,7 +826,7 @@ def transfer( odb, {obj.hash_info}, jobs=jobs, - move=upload, + hardlink=False, shallow=False, ) diff --git a/tests/func/test_add.py b/tests/func/test_add.py index fc504e3613..517816cdcb 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -496,27 +496,27 @@ def test_should_update_state_entry_for_directory_after_add( ret = main(["add", "data"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 ret = main(["status"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 ls = "dir" if os.name == "nt" else "ls" ret = main( ["run", "--single-stage", "-d", "data", "{} {}".format(ls, "data")] ) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 os.rename("data", "data" + ".back") ret = main(["checkout"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 ret = main(["status"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 class TestAddCommit(TestDvc): @@ -537,15 +537,15 @@ def test_should_collect_dir_cache_only_once(mocker, tmp_dir, dvc): counter = mocker.spy(dvc_module.objects.stage, "_stage_tree") ret = main(["add", "data"]) assert ret == 0 - assert counter.mock.call_count == 2 + assert counter.mock.call_count == 1 ret = main(["status"]) assert ret == 0 - assert counter.mock.call_count == 2 + assert counter.mock.call_count == 1 ret = main(["status"]) assert ret == 0 - assert counter.mock.call_count == 2 + assert counter.mock.call_count == 1 class TestShouldPlaceStageInDataDirIfRepositoryBelowSymlink(TestDvc): @@ -964,7 +964,7 @@ def test_add_with_cache_link_error(tmp_dir, dvc, mocker, capsys): err = capsys.readouterr()[1] assert "reconfigure cache types" in err - assert not (tmp_dir / "foo").exists() + assert (tmp_dir / "foo").exists() assert (tmp_dir / "foo.dvc").exists() assert (tmp_dir / ".dvc" / "cache").read_text() == { "ac": {"bd18db4cc2f85cedef654fccc4a4d8": "foo"} diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index bccbd8eca1..004b076fb6 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -216,7 +216,7 @@ def test_subrepos_are_ignored(tmp_dir, erepo_dir): repo.odb.local, {obj.hash_info}, shallow=False, - move=True, + hardlink=True, ) assert set(cache_dir.glob("??/*")) == { cache_dir / "e1" / "d9e8eae5374860ae025ec84cfd85c7.dir", diff --git a/tests/unit/objects/db/test_local.py b/tests/unit/objects/db/test_local.py index 2c71d3960c..915d2dadc2 100644 --- a/tests/unit/objects/db/test_local.py +++ b/tests/unit/objects/db/test_local.py @@ -103,10 +103,9 @@ def test_staging_file(tmp_dir, dvc): check(local_odb, obj) check(staging_odb, obj) - transfer(staging_odb, local_odb, {obj.hash_info}, move=True) + transfer(staging_odb, local_odb, {obj.hash_info}, hardlink=True) check(local_odb, obj) - with pytest.raises(FileNotFoundError): - check(staging_odb, obj) + check(staging_odb, obj) path_info = local_odb.hash_to_path_info(obj.hash_info.value) assert fs.exists(path_info) @@ -130,10 +129,11 @@ def test_staging_dir(tmp_dir, dvc): check(local_odb, obj) check(staging_odb, obj) - transfer(staging_odb, local_odb, {obj.hash_info}, shallow=False, move=True) + transfer( + staging_odb, local_odb, {obj.hash_info}, shallow=False, hardlink=True + ) check(local_odb, obj) - with pytest.raises(FileNotFoundError): - check(staging_odb, obj) + check(staging_odb, obj) path_info = local_odb.hash_to_path_info(obj.hash_info.value) assert fs.exists(path_info)