Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize ignore performance #4120

Merged
merged 19 commits into from
Jul 14, 2020
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 175 additions & 22 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from funcy import cached_property
from pathspec.patterns import GitWildMatchPattern
from pathspec.util import normalize_file
from pygtrie import StringTrie

from dvc.path_info import PathInfo
from dvc.scm.tree import BaseTree
Expand All @@ -23,25 +24,33 @@ def __call__(self, root, dirs, files):


class DvcIgnorePatterns(DvcIgnore):
def __init__(self, ignore_file_path, tree):
assert os.path.isabs(ignore_file_path)
def __init__(self, pattern_list, dirname):

self.pattern_list = pattern_list
self.dirname = dirname
self.prefix = self.dirname + os.sep

self.ignore_file_path = ignore_file_path
self.dirname = os.path.normpath(os.path.dirname(ignore_file_path))
regex_pattern_list = map(
GitWildMatchPattern.pattern_to_regex, pattern_list
)

self.ignore_spec = [
(ignore, re.compile("|".join(item[0] for item in group)))
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved
for ignore, group in groupby(regex_pattern_list, lambda x: x[1])
if ignore is not None
]

@classmethod
def from_files(cls, ignore_file_path, tree):
assert os.path.isabs(ignore_file_path)
dirname = os.path.normpath(os.path.dirname(ignore_file_path))
with tree.open(ignore_file_path, encoding="utf-8") as fobj:
path_spec_lines = fobj.readlines()
regex_pattern_list = map(
GitWildMatchPattern.pattern_to_regex, path_spec_lines
)
self.ignore_spec = [
(ignore, re.compile("|".join(item[0] for item in group)))
for ignore, group in groupby(
regex_pattern_list, lambda x: x[1]
)
if ignore is not None
path_spec_lines = [
line for line in map(str.strip, fobj.readlines()) if line
]

return cls(path_spec_lines, dirname)

def __call__(self, root, dirs, files):
files = [f for f in files if not self.matches(root, f)]
dirs = [d for d in dirs if not self.matches(root, d)]
Expand All @@ -51,11 +60,10 @@ def __call__(self, root, dirs, files):
def matches(self, dirname, basename):
# NOTE: `relpath` is too slow, so we have to assume that both
# `dirname` and `self.dirname` are relative or absolute together.
prefix = self.dirname + os.sep
if dirname == self.dirname:
path = basename
elif dirname.startswith(prefix):
rel = dirname[len(prefix) :]
elif dirname.startswith(self.prefix):
rel = dirname[len(self.prefix) :]
# NOTE: `os.path.join` is ~x5.5 slower
path = f"{rel}{os.sep}{basename}"
else:
Expand All @@ -73,13 +81,150 @@ def ignore(self, path):
return result

def __hash__(self):
return hash(self.ignore_file_path)
return hash(self.dirname + ":" + "\n".join(self.pattern_list))

def __eq__(self, other):
    """Compare two pattern sets by base directory and pattern list.

    Returns ``NotImplemented`` when ``other`` is not a
    ``DvcIgnorePatterns`` so Python can try the reflected comparison.
    """
    if not isinstance(other, DvcIgnorePatterns):
        return NotImplemented
    # Logical ``and`` short-circuits: the pattern lists are only compared
    # once the directories are known to match. No debug output is emitted.
    return (
        self.dirname == other.dirname
        and self.pattern_list == other.pattern_list
    )

def __bool__(self):
if self.pattern_list:
return True
return False
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved

@staticmethod
def _is_include(rule):
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved
if rule.startswith("!"):
return True, rule[1:]
return False, rule

@staticmethod
def _is_comment(rule):
if rule.startswith("#"):
return True
return False
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved

@staticmethod
def _remove_slash(rule):
if rule.startswith("\\"):
return rule[1:]
return rule

@staticmethod
def _match_all_level(rule):
if rule[:-1].find("/") >= 0 and not rule.startswith("**/"):
if rule.startswith("/"):
rule = rule[1:]
return False, rule
if rule.startswith("**/"):
rule = rule[3:]
return True, rule

def change_rule(self, rule, rel):
    """Rewrite one rule so that it is prefixed with the path ``rel``.

    The rule is stripped of surrounding whitespace first. Comment rules
    are returned as they are after stripping. Any other rule is rebuilt as
    ``[!]/<rel>/<pattern>`` or ``[!]/<rel>/**/<pattern>`` (the latter when
    ``_match_all_level`` reports the rule as matching at any depth) and
    then passed through ``normalize_file``.
    """
    stripped = rule.strip()
    if self._is_comment(stripped):
        return stripped

    is_include, pattern = self._is_include(stripped)
    match_all, pattern = self._match_all_level(pattern)
    pattern = self._remove_slash(pattern)

    negation = "!" if is_include else ""
    anchor = "/**/" if match_all else "/"
    return normalize_file(f"{negation}/{rel}{anchor}{pattern}")

def change_dirname(self, new_dirname):
    """Return these patterns re-expressed relative to ``new_dirname``.

    Returns ``self`` unchanged when ``new_dirname`` equals the current
    directory. Otherwise every pattern is rewritten with ``change_rule``
    and a new ``DvcIgnorePatterns`` rooted at ``new_dirname`` is returned.

    Raises:
        ValueError: if the relative path from ``new_dirname`` to the
            current directory starts with ``..``.
    """
    if new_dirname == self.dirname:
        return self

    rel = os.path.relpath(self.dirname, new_dirname)
    if rel.startswith(".."):
        raise ValueError("change dirname can only change to parent path")

    rebased = [self.change_rule(rule, rel) for rule in self.pattern_list]
    return DvcIgnorePatterns(rebased, new_dirname)

@staticmethod
def _longest_common_dir(dir1, dir2):
dir1_split = dir1.split(os.sep)
dir2_split = dir2.split(os.sep)
max_match = 0

for index, (i, j) in enumerate(zip(dir1_split, dir2_split)):
if i != j:
break
max_match = index
return os.sep.join(dir1_split[: max_match + 1])

def __add__(self, other):
    """Merge two pattern sets into one rooted at their common directory.

    Returns ``NotImplemented`` for operands that are not
    ``DvcIgnorePatterns``. If either operand is falsy, the other one is
    returned as is. Otherwise both sides are rebased onto their longest
    common directory and concatenated, with the patterns of the operand
    whose ``dirname`` string is strictly shorter placed first; when
    ``self.dirname`` is not shorter, ``other``'s patterns come first.
    """
    if not isinstance(other, DvcIgnorePatterns):
        return NotImplemented
    if not other:
        return self
    if not self:
        return other

    common_dir = self._longest_common_dir(self.dirname, other.dirname)
    own_rules = self.change_dirname(common_dir).pattern_list
    other_rules = other.change_dirname(common_dir).pattern_list

    if len(self.dirname) < len(other.dirname):
        combined = own_rules + other_rules
    else:
        combined = other_rules + own_rules
    return DvcIgnorePatterns(combined, common_dir)

__radd__ = __add__


return self.ignore_file_path == other.ignore_file_path
class DvcIgnorePatternsTrie(DvcIgnore):
    """Singleton trie that maps directories to merged ignore patterns.

    Every construction returns the same instance, so patterns registered
    through one reference are visible through all others. Lookups return
    the patterns stored under the longest registered prefix of a path.
    """

    # Class-level default; replaced by a real trie on first ``__init__``.
    trie = None

    def __init__(self):
        # ``__init__`` runs on every ``DvcIgnorePatternsTrie()`` call, so
        # the trie is only created when none has been set yet.
        if self.trie is None:
            self.trie = StringTrie(separator=os.sep)

    def __new__(cls, *args, **kwargs):
        # Keep the shared instance on this class itself. Checking
        # ``cls.__dict__`` rather than ``hasattr`` prevents a subclass
        # from picking up an instance created for its parent.
        if "_instance" not in cls.__dict__:
            cls._instance = object.__new__(cls)
        return cls._instance

    def __call__(self, root, dirs, files):
        """Filter ``dirs`` and ``files`` with the patterns found for ``root``.

        Returns the inputs unchanged when the patterns found are falsy.
        """
        ignore_pattern = self[root]
        if ignore_pattern:
            return ignore_pattern(root, dirs, files)
        return dirs, files

    def __setitem__(self, root, ignore_pattern):
        """Store ``ignore_pattern`` under ``root``, merged with what applies.

        The patterns currently returned for ``root`` are added to the new
        ones before the result is stored.
        """
        base_pattern = self[root]
        self.trie[root] = base_pattern + ignore_pattern

    def __getitem__(self, root):
        """Return the patterns stored under the longest prefix of ``root``.

        Falls back to an empty ``DvcIgnorePatterns`` rooted at ``root``
        when the trie lookup result is falsy.
        """
        # NOTE(review): relies on the ``longest_prefix`` result being
        # falsy when nothing matches and exposing ``.value`` otherwise.
        ignore_pattern = self.trie.longest_prefix(root)
        if ignore_pattern:
            return ignore_pattern.value
        return DvcIgnorePatterns([], root)


class DvcIgnoreDirs(DvcIgnore):
Expand Down Expand Up @@ -121,14 +266,19 @@ def __init__(self, tree, root_dir):
DvcIgnoreDirs([".git", ".hg", ".dvc"]),
DvcIgnoreRepo(),
}
for root, dirs, files in self.tree.walk(self.root_dir):
for root, dirs, _ in self.tree.walk(self.root_dir):
self._update(root)
dirs[:], files[:] = self(root, dirs, files)
dirs[:], _ = self(root, dirs, [])

def _update(self, dirname):
ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
if self.tree.exists(ignore_file_path):
self.ignores.add(DvcIgnorePatterns(ignore_file_path, self.tree))
ignore_pattern = DvcIgnorePatterns.from_files(
ignore_file_path, self.tree
)
ignore_pattern_trie = DvcIgnorePatternsTrie()
ignore_pattern_trie[dirname] = ignore_pattern
self.ignores.add(ignore_pattern_trie)

def __call__(self, root, dirs, files):
for ignore in self.ignores:
Expand Down Expand Up @@ -248,3 +398,6 @@ def stat(self, path):
@property
def hash_jobs(self):
    """Expose the ``hash_jobs`` attribute of the wrapped ``tree``."""
    wrapped_tree = self.tree
    return wrapped_tree.hash_jobs

def relative_path(self, abspath):
    """Placeholder: performs no work and returns ``None``."""
    return None
efiop marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ count=true
[isort]
include_trailing_comma=true
known_first_party=dvc,tests
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,grandalf,mock,moto,nanotime,networkx,packaging,pathspec,pylint,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,grandalf,mock,moto,nanotime,networkx,packaging,pathspec,pygtrie,pylint,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
line_length=79
force_grid_wrap=0
use_parentheses=True
Expand Down
Loading