Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize ignore performance #4120

Merged
merged 19 commits into from
Jul 14, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 132 additions & 22 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from funcy import cached_property
from pathspec.patterns import GitWildMatchPattern
from pathspec.util import normalize_file
from pygtrie import StringTrie

from dvc.path_info import PathInfo
from dvc.scm.tree import BaseTree
Expand All @@ -23,25 +24,33 @@ def __call__(self, root, dirs, files):


class DvcIgnorePatterns(DvcIgnore):
def __init__(self, ignore_file_path, tree):
assert os.path.isabs(ignore_file_path)
def __init__(self, pattern_list, dirname):

self.pattern_list = pattern_list
self.dirname = dirname
self.prefix = self.dirname + os.sep

regex_pattern_list = map(
GitWildMatchPattern.pattern_to_regex, pattern_list
)

self.ignore_file_path = ignore_file_path
self.dirname = os.path.normpath(os.path.dirname(ignore_file_path))
self.ignore_spec = [
(ignore, re.compile("|".join(item[0] for item in group)))
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved
for ignore, group in groupby(regex_pattern_list, lambda x: x[1])
if ignore is not None
]

@classmethod
def from_files(cls, ignore_file_path, tree):
assert os.path.isabs(ignore_file_path)
dirname = os.path.normpath(os.path.dirname(ignore_file_path))
with tree.open(ignore_file_path, encoding="utf-8") as fobj:
path_spec_lines = fobj.readlines()
regex_pattern_list = map(
GitWildMatchPattern.pattern_to_regex, path_spec_lines
)
self.ignore_spec = [
(ignore, re.compile("|".join(item[0] for item in group)))
for ignore, group in groupby(
regex_pattern_list, lambda x: x[1]
)
if ignore is not None
path_spec_lines = [
line for line in map(str.strip, fobj.readlines()) if line
]

return cls(path_spec_lines, dirname)

def __call__(self, root, dirs, files):
files = [f for f in files if not self.matches(root, f)]
dirs = [d for d in dirs if not self.matches(root, d)]
Expand All @@ -51,11 +60,10 @@ def __call__(self, root, dirs, files):
def matches(self, dirname, basename):
# NOTE: `relpath` is too slow, so we have to assume that both
# `dirname` and `self.dirname` are relative or absolute together.
prefix = self.dirname + os.sep
if dirname == self.dirname:
path = basename
elif dirname.startswith(prefix):
rel = dirname[len(prefix) :]
elif dirname.startswith(self.prefix):
rel = dirname[len(self.prefix) :]
# NOTE: `os.path.join` is ~x5.5 slower
path = f"{rel}{os.sep}{basename}"
else:
Expand All @@ -73,13 +81,107 @@ def ignore(self, path):
return result

def __hash__(self):
return hash(self.ignore_file_path)
return hash(self.dirname + ":" + "\n".join(self.pattern_list))

def __eq__(self, other):
if not isinstance(other, DvcIgnorePatterns):
return NotImplemented
return (self.dirname == other.dirname) & (
self.pattern_list == other.pattern_list
)

def __bool__(self):
if self.pattern_list:
return True
return False
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved

def change_dirname(self, new_dirname):
prefix = new_dirname + os.sep
if new_dirname == self.dirname:
return self
if not self.dirname.startswith(prefix):
raise ValueError("change dirname can only change to parent path")
rel = self.dirname[len(prefix) :]
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved
new_pattern_list = []
for rule in self.pattern_list:
if rule.startswith("!"):
rule = f"!{rel}{os.sep}{rule[1:]}"
else:
rule = f"{rel}{os.sep}{rule}"
rule = normalize_file(rule)
new_pattern_list.append(rule)
return DvcIgnorePatterns(new_pattern_list, new_dirname)

@staticmethod
def _longest_common_dir(dir1, dir2):
dir1_split = dir1.split(os.sep)
dir2_split = dir2.split(os.sep)
max_match = 0

for index, (i, j) in enumerate(zip(dir1_split, dir2_split)):
if i != j:
break
max_match = index
return os.sep.join(dir1_split[: max_match + 1])

def __add__(self, other):
if not isinstance(other, DvcIgnorePatterns):
return NotImplemented

if not other:
merged = self
elif not self:
merged = other
else:
longest_common_dir = self._longest_common_dir(
self.dirname, other.dirname
)
self_to_lcd = self.change_dirname(longest_common_dir)
other_to_lcd = other.change_dirname(longest_common_dir)
if len(self.dirname) < len(other.dirname):
merged = DvcIgnorePatterns(
self_to_lcd.pattern_list + other_to_lcd.pattern_list,
longest_common_dir,
)
else:
merged = DvcIgnorePatterns(
other_to_lcd.pattern_list + self_to_lcd.pattern_list,
longest_common_dir,
)

return merged

__radd__ = __add__


class DvcIgnorePatternsTrie(DvcIgnore):
trie = None

return self.ignore_file_path == other.ignore_file_path
def __init__(self):
if self.trie is None:
self.trie = StringTrie(separator=os.sep)

def __new__(cls, *args, **kwargs):
if not hasattr(DvcIgnorePatterns, "_instance"):
if not hasattr(DvcIgnorePatterns, "_instance"):
DvcIgnorePatterns._instance = object.__new__(cls)
return DvcIgnorePatterns._instance
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved

def __call__(self, root, dirs, files):
ignore_pattern = self[root]
if ignore_pattern:
return ignore_pattern(root, dirs, files)
return dirs, files

def __setitem__(self, root, ignore_pattern):
base_pattern = self[root]
self.trie[root] = base_pattern + ignore_pattern

def __getitem__(self, root):
ignore_pattern = self.trie.longest_prefix(root)
if ignore_pattern:
return ignore_pattern.value
return DvcIgnorePatterns([], root)


class DvcIgnoreDirs(DvcIgnore):
Expand Down Expand Up @@ -121,14 +223,19 @@ def __init__(self, tree, root_dir):
DvcIgnoreDirs([".git", ".hg", ".dvc"]),
DvcIgnoreRepo(),
}
for root, dirs, files in self.tree.walk(self.root_dir):
for root, dirs, _ in self.tree.walk(self.root_dir):
self._update(root)
dirs[:], files[:] = self(root, dirs, files)
dirs[:], _ = self(root, dirs, [])

def _update(self, dirname):
ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
if self.tree.exists(ignore_file_path):
self.ignores.add(DvcIgnorePatterns(ignore_file_path, self.tree))
ignore_pattern = DvcIgnorePatterns.from_files(
ignore_file_path, self.tree
)
ignore_pattern_trie = DvcIgnorePatternsTrie()
ignore_pattern_trie[dirname] = ignore_pattern
self.ignores.add(ignore_pattern_trie)

def __call__(self, root, dirs, files):
for ignore in self.ignores:
Expand Down Expand Up @@ -248,3 +355,6 @@ def stat(self, path):
@property
def hash_jobs(self):
return self.tree.hash_jobs

def relative_path(self, abspath):
pass
efiop marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ count=true
[isort]
include_trailing_comma=true
known_first_party=dvc,tests
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,google,grandalf,mock,moto,nanotime,networkx,packaging,paramiko,pathspec,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,google,grandalf,mock,moto,nanotime,networkx,packaging,paramiko,pathspec,pygtrie,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
line_length=79
force_grid_wrap=0
use_parentheses=True
Expand Down
46 changes: 44 additions & 2 deletions tests/func/test_ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
DvcIgnore,
DvcIgnoreDirs,
DvcIgnorePatterns,
DvcIgnorePatternsTrie,
DvcIgnoreRepo,
)
from dvc.repo import Repo
Expand Down Expand Up @@ -98,12 +99,22 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname):

assert len(dvc.tree.dvcignore.ignores) == 3
assert DvcIgnoreDirs([".git", ".hg", ".dvc"]) in dvc.tree.dvcignore.ignores
ignore_pattern_trie = None
for ignore in dvc.tree.dvcignore.ignores:
if isinstance(ignore, DvcIgnorePatternsTrie):
ignore_pattern_trie = ignore

print(os.fspath(top_ignore_file))
print(os.fspath(ignore_file))

assert ignore_pattern_trie is not None
assert (
DvcIgnorePatterns(
DvcIgnorePatterns.from_files(
os.fspath(top_ignore_file), WorkingTree(dvc.root_dir)
)
in dvc.tree.dvcignore.ignores
== ignore_pattern_trie[os.fspath(ignore_file)]
)

assert any(
i for i in dvc.tree.dvcignore.ignores if isinstance(i, DvcIgnoreRepo)
)
Expand Down Expand Up @@ -173,3 +184,34 @@ def test_ignore_blank_line(tmp_dir, dvc):
tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "foo\n\ndir/ignored")

assert _files_set("dir", dvc.tree) == {"dir/other"}


def test_multi_ignore_file(tmp_dir, dvc, monkeypatch):
tmp_dir.gen({"dir": {"subdir": {"should_ignore": "1", "not_ignore": "1"}}})
tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/subdir/*_ignore")
tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "!subdir/not_ignore"}})

assert _files_set("dir", dvc.tree) == {
"dir/subdir/not_ignore",
"dir/{}".format(DvcIgnore.DVCIGNORE_FILE),
}


def test_no_ignore_in_parent_dir(tmp_dir, dvc, monkeypatch):
tmp_dir.gen(
{
"dir": {
"subdir": {"should_ignore": "1", "not_ignore": "1"},
"file1": "a",
}
}
)
tmp_dir.gen(
{"dir": {"subdir": {DvcIgnore.DVCIGNORE_FILE: "should_ignore"}}}
)

assert _files_set("dir", dvc.tree) == {
"dir/file1",
"dir/subdir/not_ignore",
"dir/subdir/{}".format(DvcIgnore.DVCIGNORE_FILE),
}
2 changes: 1 addition & 1 deletion tests/unit/test_ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
def mock_dvcignore(dvcignore_path, patterns):
tree = MagicMock()
with patch.object(tree, "open", mock_open(read_data="\n".join(patterns))):
ignore_patterns = DvcIgnorePatterns(dvcignore_path, tree)
ignore_patterns = DvcIgnorePatterns.from_files(dvcignore_path, tree)

return ignore_patterns

Expand Down