diff --git a/dvc/fs/local.py b/dvc/fs/local.py index 16867b620a..a236bace90 100644 --- a/dvc/fs/local.py +++ b/dvc/fs/local.py @@ -39,7 +39,11 @@ def exists(self, path_info) -> bool: return os.path.lexists(path_info) def checksum(self, path_info) -> str: - return self.fs.checksum(path_info) + from fsspec.utils import tokenize + + st = os.stat(path_info) + + return str(int(tokenize([st.st_ino, st.st_mtime, st.st_size]), 16)) def isfile(self, path_info) -> bool: return os.path.isfile(path_info) diff --git a/dvc/fs/utils.py b/dvc/fs/utils.py index cee9ae16fe..9da6e14946 100644 --- a/dvc/fs/utils.py +++ b/dvc/fs/utils.py @@ -14,27 +14,48 @@ logger = logging.getLogger(__name__) +def _link( + from_fs: "BaseFileSystem", + from_info: "AnyPath", + to_fs: "BaseFileSystem", + to_info: "DvcPath", + hardlink: bool = False, +) -> None: + if not isinstance(from_fs, type(to_fs)): + raise OSError(errno.EXDEV, "can't link across filesystems") + + links = ["reflink"] + ["hardlink"] if hardlink else [] + while links: + link = links.pop(0) + try: + func = getattr(to_fs, link) + except AttributeError: + continue + + try: + return func(from_info, to_info) + except RemoteActionNotImplemented: + continue + except OSError as exc: + if exc.errno not in [errno.EXDEV, errno.ENOTSUP]: + raise + + raise OSError(errno.ENOTSUP, "reflink and hardlink are not supported") + + def transfer( from_fs: "BaseFileSystem", from_info: "AnyPath", to_fs: "BaseFileSystem", to_info: "DvcPath", - move: bool = False, + hardlink: bool = False, ) -> None: - same_fs = isinstance(from_fs, type(to_fs)) - use_move = same_fs and move try: - if use_move: - return to_fs.move(from_info, to_info) - - if same_fs: - try: - return from_fs.reflink(from_info, to_info) - except RemoteActionNotImplemented: - pass - except OSError as exc: - if exc.errno not in [errno.EXDEV, errno.ENOTSUP]: - raise + try: + return _link(from_fs, from_info, to_fs, to_info, hardlink=hardlink) + except OSError as exc: + if exc.errno not in [errno.EXDEV, errno.ENOTSUP]: + raise if isinstance(from_fs, LocalFileSystem): if not isinstance(from_info, from_fs.PATH_CLS): @@ -61,8 +82,6 @@ def transfer( and isinstance(exc.__context__, FileExistsError) ): logger.debug("'%s' file already exists, skipping", to_info) - if use_move: - from_fs.remove(from_info) return None raise diff --git a/dvc/objects/checkout.py b/dvc/objects/checkout.py index 92cfc97a56..98414f1f2d 100644 --- a/dvc/objects/checkout.py +++ b/dvc/objects/checkout.py @@ -97,13 +97,13 @@ def _link(cache, from_info, to_info): raise CheckoutError([str(to_info)]) from exc -def _cache_is_copy(cache, path_info): +def _confirm_cache_type(cache, path_info): """Checks whether cache uses copies.""" if cache.cache_type_confirmed: - return cache.cache_types[0] == "copy" + return if set(cache.cache_types) <= {"copy"}: - return True + return workspace_file = path_info.with_name("." + uuid()) test_cache_file = cache.path_info / ".cache_type_test_file" @@ -118,7 +118,16 @@ def _cache_is_copy(cache, path_info): cache.fs.remove(test_cache_file) cache.cache_type_confirmed = True - return cache.cache_types[0] == "copy" + + +def _relink(cache, cache_info, fs, path_info, in_cache, force): + _remove(path_info, fs, in_cache, force=force) + _link(cache, cache_info, path_info) + # NOTE: Depending on a file system (e.g. on NTFS), `_remove` might reset + # read-only permissions in order to delete a hardlink to protected object, + # which will also reset it for the object itself, making it unprotected, + # so we need to protect it back. + cache.protect(cache_info) def _checkout_file( @@ -133,18 +142,33 @@ def _checkout_file( ): """The file is changed we need to checkout a new copy""" modified = False + + _confirm_cache_type(cache, path_info) + cache_info = cache.hash_to_path_info(change.new.oid.value) if change.old.oid: if relink: - if fs.iscopy(path_info) and _cache_is_copy(cache, path_info): + if fs.iscopy(path_info) and cache.cache_types[0] == "copy": cache.unprotect(path_info) else: - _remove(path_info, fs, change.old.in_cache, force=force) - _link(cache, cache_info, path_info) + _relink( + cache, + cache_info, + fs, + path_info, + change.old.in_cache, + force=force, + ) else: modified = True - _remove(path_info, fs, change.old.in_cache, force=force) - _link(cache, cache_info, path_info) + _relink( + cache, + cache_info, + fs, + path_info, + change.old.in_cache, + force=force, + ) else: _link(cache, cache_info, path_info) modified = True diff --git a/dvc/objects/db/base.py b/dvc/objects/db/base.py index f0835b6c76..927c8b41b0 100644 --- a/dvc/objects/db/base.py +++ b/dvc/objects/db/base.py @@ -81,13 +81,17 @@ def _add_file( from_info: "AnyPath", to_info: "DvcPath", _hash_info: "HashInfo", - move: bool = False, + hardlink: bool = False, ): from dvc import fs self.makedirs(to_info.parent) return fs.utils.transfer( - from_fs, from_info, self.fs, to_info, move=move + from_fs, + from_info, + self.fs, + to_info, + hardlink=hardlink, ) def add( @@ -95,7 +99,7 @@ def add( path_info: "AnyPath", fs: "BaseFileSystem", hash_info: "HashInfo", - move: bool = True, + hardlink: bool = False, verify: Optional[bool] = None, ): if self.read_only: @@ -110,7 +114,7 @@ def add( pass cache_info = self.hash_to_path_info(hash_info.value) - self._add_file(fs, path_info, cache_info, hash_info, move=move) + self._add_file(fs, path_info, cache_info, hash_info, hardlink=hardlink) try: if verify: diff --git a/dvc/objects/db/reference.py b/dvc/objects/db/reference.py index 0332a924c3..d0affe89f1 100644 --- a/dvc/objects/db/reference.py +++ b/dvc/objects/db/reference.py @@ -58,12 +58,16 @@ def _add_file( from_info: "AnyPath", to_info: "DvcPath", hash_info: "HashInfo", - move: bool = False, + hardlink: bool = False, ): self.makedirs(to_info.parent) if hash_info.isdir: return super()._add_file( - from_fs, from_info, to_info, hash_info, move + from_fs, + from_info, + to_info, + hash_info, + hardlink=hardlink, ) ref_file = ReferenceHashFile(from_info, from_fs, hash_info) self._obj_cache[hash_info] = ref_file diff --git a/dvc/objects/stage.py b/dvc/objects/stage.py index 2458c0530d..c82bb5656b 100644 --- a/dvc/objects/stage.py +++ b/dvc/objects/stage.py @@ -88,7 +88,7 @@ def _stage_file(path_info, fs, name, odb=None, upload_odb=None, dry_run=False): if dry_run: obj = HashFile(path_info, fs, hash_info) else: - odb.add(path_info, fs, hash_info, move=False) + odb.add(path_info, fs, hash_info, hardlink=False) obj = odb.get(hash_info) return path_info, meta, obj @@ -175,7 +175,7 @@ def _stage_tree(path_info, fs, fs_info, name, odb=None, **kwargs): path_info, fs ) tree.digest(hash_info=hash_info) - odb.add(tree.path_info, tree.fs, tree.hash_info, move=True) + odb.add(tree.path_info, tree.fs, tree.hash_info, hardlink=False) raw = odb.get(tree.hash_info) # cleanup unneeded memfs tmpfile and return tree based on the # ODB fs/path diff --git a/dvc/objects/transfer.py b/dvc/objects/transfer.py index 00078d1aef..965214a11b 100644 --- a/dvc/objects/transfer.py +++ b/dvc/objects/transfer.py @@ -133,7 +133,7 @@ def transfer( obj_ids: Iterable["HashInfo"], jobs: Optional[int] = None, verify: bool = False, - move: bool = False, + hardlink: bool = False, **kwargs, ) -> int: """Transfer (copy) the specified objects from one ODB to another. @@ -157,7 +157,11 @@ def transfer( def func(hash_info: "HashInfo") -> None: obj = src.get(hash_info) return dest.add( - obj.path_info, obj.fs, obj.hash_info, verify=verify, move=move + obj.path_info, + obj.fs, + obj.hash_info, + verify=verify, + hardlink=hardlink, ) total = len(status.new) diff --git a/dvc/output.py b/dvc/output.py index 814b51c6b5..d12ea209d6 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -600,7 +600,7 @@ def commit(self, filter_info=None): self.odb, {obj.hash_info}, shallow=False, - move=True, + hardlink=True, ) checkout( filter_info or self.path_info, @@ -629,7 +629,7 @@ def _commit_granular_dir(self, filter_info): self.odb, {save_obj.hash_info} | {oid for _, _, oid in save_obj}, shallow=True, - move=True, + hardlink=True, ) return checkout_obj @@ -826,7 +826,7 @@ def transfer( odb, {obj.hash_info}, jobs=jobs, - move=upload, + hardlink=False, shallow=False, ) diff --git a/dvc/system.py b/dvc/system.py index 43a0e4e187..893e460631 100644 --- a/dvc/system.py +++ b/dvc/system.py @@ -2,14 +2,42 @@ import logging import os import platform +import stat +import sys logger = logging.getLogger(__name__) +if os.name == "nt" and sys.version_info < (3, 8): + # NOTE: using backports for `os.path.realpath` + # See https://bugs.python.org/issue9949 for more info. + # pylint: disable=import-error, no-name-in-module + from jaraco.windows.filesystem.backports import realpath as _realpath + + def realpath(path): + return _realpath(os.fspath(path)) + + +else: + realpath = os.path.realpath + + class System: @staticmethod def hardlink(source, link_name): - os.link(source, link_name) + # NOTE: we should really be using `os.link()` here with + # `follow_symlinks=True`, but unfortunately the implementation is + # buggy across platforms, so until it is fixed, we just dereference + # the symlink ourselves here. + # + # See https://bugs.python.org/issue41355 for more info. + st = os.lstat(source) + if stat.S_ISLNK(st.st_mode): + src = realpath(source) + else: + src = source + + os.link(src, link_name) @staticmethod def symlink(source, link_name): diff --git a/setup.cfg b/setup.cfg index 15ec180797..544cdc68f6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -75,6 +75,7 @@ install_requires = fsspec[http]>=2021.10.1 aiohttp-retry>=2.4.5 diskcache>=5.2.1 + jaraco.windows>=5.7.0; python_version < '3.8' and sys_platform == 'win32' [options.extras_require] # gssapi should not be included in all_remotes, because it doesn't have wheels @@ -139,7 +140,6 @@ tests = Pygments==2.10.0 collective.checkdocs==0.2 pydocstyle==6.1.1 - jaraco.windows==5.7.0 # pylint requirements pylint==2.11.1 # we use this to suppress pytest-related false positives in our tests. diff --git a/tests/func/test_add.py b/tests/func/test_add.py index fc504e3613..517816cdcb 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -496,27 +496,27 @@ def test_should_update_state_entry_for_directory_after_add( ret = main(["add", "data"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 ret = main(["status"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 ls = "dir" if os.name == "nt" else "ls" ret = main( ["run", "--single-stage", "-d", "data", "{} {}".format(ls, "data")] ) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 os.rename("data", "data" + ".back") ret = main(["checkout"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 ret = main(["status"]) assert ret == 0 - assert file_md5_counter.mock.call_count == 4 + assert file_md5_counter.mock.call_count == 3 class TestAddCommit(TestDvc): @@ -537,15 +537,15 @@ def test_should_collect_dir_cache_only_once(mocker, tmp_dir, dvc): counter = mocker.spy(dvc_module.objects.stage, "_stage_tree") ret = main(["add", "data"]) assert ret == 0 - assert counter.mock.call_count == 2 + assert counter.mock.call_count == 1 ret = main(["status"]) assert ret == 0 - assert counter.mock.call_count == 2 + assert counter.mock.call_count == 1 ret = main(["status"]) assert ret == 0 - assert counter.mock.call_count == 2 + assert counter.mock.call_count == 1 class TestShouldPlaceStageInDataDirIfRepositoryBelowSymlink(TestDvc): @@ -964,7 +964,7 @@ def test_add_with_cache_link_error(tmp_dir, dvc, mocker, capsys): err = capsys.readouterr()[1] assert "reconfigure cache types" in err - assert not (tmp_dir / "foo").exists() + assert (tmp_dir / "foo").exists() assert (tmp_dir / "foo.dvc").exists() assert (tmp_dir / ".dvc" / "cache").read_text() == { "ac": {"bd18db4cc2f85cedef654fccc4a4d8": "foo"} diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index bccbd8eca1..004b076fb6 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -216,7 +216,7 @@ def test_subrepos_are_ignored(tmp_dir, erepo_dir): repo.odb.local, {obj.hash_info}, shallow=False, - move=True, + hardlink=True, ) assert set(cache_dir.glob("??/*")) == { cache_dir / "e1" / "d9e8eae5374860ae025ec84cfd85c7.dir", diff --git a/tests/unit/objects/db/test_local.py b/tests/unit/objects/db/test_local.py index 2c71d3960c..915d2dadc2 100644 --- a/tests/unit/objects/db/test_local.py +++ b/tests/unit/objects/db/test_local.py @@ -103,10 +103,9 @@ def test_staging_file(tmp_dir, dvc): check(local_odb, obj) check(staging_odb, obj) - transfer(staging_odb, local_odb, {obj.hash_info}, move=True) + transfer(staging_odb, local_odb, {obj.hash_info}, hardlink=True) check(local_odb, obj) - with pytest.raises(FileNotFoundError): - check(staging_odb, obj) + check(staging_odb, obj) path_info = local_odb.hash_to_path_info(obj.hash_info.value) assert fs.exists(path_info) @@ -130,10 +129,11 @@ def test_staging_dir(tmp_dir, dvc): check(local_odb, obj) check(staging_odb, obj) - transfer(staging_odb, local_odb, {obj.hash_info}, shallow=False, move=True) + transfer( + staging_odb, local_odb, {obj.hash_info}, shallow=False, hardlink=True + ) check(local_odb, obj) - with pytest.raises(FileNotFoundError): - check(staging_odb, obj) + check(staging_odb, obj) path_info = local_odb.hash_to_path_info(obj.hash_info.value) assert fs.exists(path_info)