From 253a0eae1374ff0bc81046507714353af977d8f3 Mon Sep 17 00:00:00 2001
From: karajan1001 <mishanyo1001@gmail.com>
Date: Wed, 8 Jul 2020 04:04:52 +0800
Subject: [PATCH] Add more tests according to gitignore (#4166)

* Add more tests according to .gitignore

* Test update

* Update test

* Solve question of path

* update one test

* Update 2 tests

* Add one tests

* comment those failed tests caused by other package

* parametrize some tests

Co-authored-by: karajan1001 <jah@mail.ustc.edu.cn>
---
 dvc/ignore.py             |  20 +++--
 tests/func/test_ignore.py |  63 +++++++++++++++
 tests/unit/test_ignore.py | 160 ++++++++++++++++++++++++++++----------
 3 files changed, 195 insertions(+), 48 deletions(-)

diff --git a/dvc/ignore.py b/dvc/ignore.py
index c9257e5204..35d7ede2d4 100644
--- a/dvc/ignore.py
+++ b/dvc/ignore.py
@@ -44,11 +44,11 @@ def __init__(self, ignore_file_path, tree):
 
     def __call__(self, root, dirs, files):
         files = [f for f in files if not self.matches(root, f)]
-        dirs = [d for d in dirs if not self.matches(root, d)]
+        dirs = [d for d in dirs if not self.matches(root, d, True)]
 
         return dirs, files
 
-    def matches(self, dirname, basename):
+    def matches(self, dirname, basename, is_dir=False):
         # NOTE: `relpath` is too slow, so we have to assume that both
         # `dirname` and `self.dirname` are relative or absolute together.
         prefix = self.dirname + os.sep
@@ -63,13 +63,19 @@ def matches(self, dirname, basename):
 
         if not System.is_unix():
             path = normalize_file(path)
-        return self.ignore(path)
+        return self.ignore(path, is_dir)
 
-    def ignore(self, path):
+    def ignore(self, path, is_dir):
         result = False
-        for ignore, pattern in self.ignore_spec:
-            if pattern.match(path):
-                result = ignore
+        if is_dir:
+            path_dir = f"{path}/"
+            for ignore, pattern in self.ignore_spec:
+                if pattern.match(path) or pattern.match(path_dir):
+                    result = ignore
+        else:
+            for ignore, pattern in self.ignore_spec:
+                if pattern.match(path):
+                    result = ignore
         return result
 
     def __hash__(self):
diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py
index 03b1e95678..ee9abd9431 100644
--- a/tests/func/test_ignore.py
+++ b/tests/func/test_ignore.py
@@ -173,3 +173,66 @@ def test_ignore_blank_line(tmp_dir, dvc):
     tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "foo\n\ndir/ignored")
 
     assert _files_set("dir", dvc.tree) == {"dir/other"}
+
+
+# It is not possible to re-include a file if a parent directory of
+# that file is excluded.
+# Git doesn’t list excluded directories for performance reasons,
+# so any patterns on contained files have no effect,
+# no matter where they are defined.
+@pytest.mark.parametrize(
+    "data_struct, pattern_list, result_set",
+    [
+        (
+            {"dir": {"subdir": {"not_ignore": "121"}}},
+            ["subdir/*", "!not_ignore"],
+            {"dir/subdir/not_ignore"},
+        ),
+        (
+            {"dir": {"subdir": {"should_ignore": "121"}}},
+            ["subdir", "!should_ignore"],
+            set(),
+        ),
+        (
+            {"dir": {"subdir": {"should_ignore": "121"}}},
+            ["subdir/", "!should_ignore"],
+            set(),
+        ),
+    ],
+)
+def test_ignore_file_in_parent_path(
+    tmp_dir, dvc, data_struct, pattern_list, result_set
+):
+    tmp_dir.gen(data_struct)
+    tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "\n".join(pattern_list))
+    assert _files_set("dir", dvc.tree) == result_set
+
+
+# If there is a separator at the end of the pattern then the pattern
+# will only match directories,
+# otherwise the pattern can match both files and directories.
+# For example, a pattern doc/frotz/ matches doc/frotz directory,
+# but not a/doc/frotz directory;
+def test_ignore_sub_directory(tmp_dir, dvc):
+    tmp_dir.gen(
+        {
+            "dir": {
+                "doc": {"fortz": {"b": "b"}},
+                "a": {"doc": {"fortz": {"a": "a"}}},
+            }
+        }
+    )
+    tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "doc/fortz"}})
+    assert _files_set("dir", dvc.tree) == {
+        "dir/a/doc/fortz/a",
+        "dir/{}".format(DvcIgnore.DVCIGNORE_FILE),
+    }
+
+
+# however frotz/ matches frotz and a/frotz that is a directory
+def test_ignore_directory(tmp_dir, dvc):
+    tmp_dir.gen({"dir": {"fortz": {}, "a": {"fortz": {}}}})
+    tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "fortz"}})
+    assert _files_set("dir", dvc.tree) == {
+        "dir/{}".format(DvcIgnore.DVCIGNORE_FILE),
+    }
diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py
index 40f936d091..85c46edee4 100644
--- a/tests/unit/test_ignore.py
+++ b/tests/unit/test_ignore.py
@@ -14,34 +14,51 @@ def mock_dvcignore(dvcignore_path, patterns):
     return ignore_patterns
 
 
-def test_ignore_from_file_should_filter_dirs_and_files():
-    dvcignore_path = os.path.join(
-        os.path.sep, "full", "path", "to", "ignore", "file", ".dvcignore"
-    )
-
-    patterns = ["dir_to_ignore", "file_to_ignore"]
-
-    root = os.path.dirname(dvcignore_path)
-    dirs = ["dir1", "dir2", "dir_to_ignore"]
-    files = ["file1", "file2", "file_to_ignore"]
-
-    ignore = mock_dvcignore(dvcignore_path, patterns)
-    new_dirs, new_files = ignore(root, dirs, files)
-
-    assert {"dir1", "dir2"} == set(new_dirs)
-    assert {"file1", "file2"} == set(new_files)
-
-
 @pytest.mark.parametrize(
     "file_to_ignore_relpath, patterns,  expected_match",
     [
+        # all rules from https://git-scm.com/docs/gitignore
         ("to_ignore", ["to_ignore"], True),
+        ("dont_ignore.txt", ["dont_ignore"], False),
+        # A blank line matches no files, so it can serve as a separator for
+        # readability.
+        ("to_ignore", ["", "to_ignore"], True),
+        # A line starting with # serves as a comment.
+        # Put a backslash ("\") in front of the first hash for patterns
+        # that begin with a hash.
+        ("#to_ignore", ["\\#to_ignore"], True),
+        ("#to_ignore", ["#to_ignore"], False),
+        # Trailing spaces are ignored unless they are quoted with
+        # backslash ("\").
+        (" to_ignore", [" to_ignore"], False),
+        (" to_ignore", ["\\ to_ignore"], True),
+        # An optional prefix "!" which negates the pattern; any matching file
+        # excluded by a previous pattern will become included again.
         ("to_ignore.txt", ["to_ignore*"], True),
-        (
-            os.path.join("rel", "p", "p2", "to_ignore"),
-            ["rel/**/to_ignore"],
-            True,
-        ),
+        ("to_ignore.txt", ["to_ignore*", "!to_ignore.txt"], False),
+        ("to_ignore.txt", ["!to_ignore.txt", "to_ignore*"], True),
+        # It is not possible to re-include a file if a parent directory of
+        # that file is excluded.
+        # Git doesn’t list excluded directories for performance reasons,
+        # so any patterns on contained files have no effect,
+        # no matter where they are defined.
+        # see (`tests/func/test_ignore.py::test_ignore_parent_path`)
+        # Put a backslash ("\") in front of the first "!"
+        # for patterns that begin with a literal "!",
+        # for example, "\!important!.txt".
+        ("!to_ignore.txt", ["\\!to_ignore.txt"], True),
+        # The slash / is used as the directory separator.
+        # Separators may occur at the beginning, middle or end of the
+        # .gitignore search pattern.
+        # If there is a separator at the beginning or middle (or both)
+        # of the pattern, then the pattern is relative to the directory
+        # level of the particular .gitignore file itself.
+        # Otherwise the pattern may also match at any level below
+        # the .gitignore level.
+        ("file", ["/file"], True),
+        (os.path.join("data", "file"), ["/file"], False),
+        (os.path.join("data", "file"), ["data/file"], True),
+        (os.path.join("other", "data", "file"), ["data/file"], False),
         (
             os.path.join(
                 os.path.sep,
@@ -55,20 +72,96 @@ def test_ignore_from_file_should_filter_dirs_and_files():
             ["to_ignore"],
             True,
         ),
+        # If there is a separator at the end of the pattern then the pattern
+        # will only match directories,
+        # otherwise the pattern can match both files and directories.
+        # For example, a pattern doc/frotz/ matches doc/frotz directory,
+        # but not a/doc/frotz directory;
+        # see (`tests/func/test_ignore.py::test_ignore_sub_directory`)
+        # however frotz/ matches frotz and a/frotz that is a directory
+        # (all paths are relative from the .gitignore file).
+        # see (`tests/func/test_ignore.py::test_ignore_directory`)
+        # An asterisk "*" matches anything except a slash.
         ("to_ignore.txt", ["/*.txt"], True),
+        (os.path.join("path", "to_ignore.txt"), ["/*.txt"], False),
+        (os.path.join("data", "file.txt"), ["data/*"], True),
+        # wait for Git
+        # (os.path.join("data", "sub", "file.txt"), ["data/*"], True),
         (
             os.path.join("rel", "path", "path2", "to_ignore"),
             ["rel/*/to_ignore"],
             False,
         ),
-        (os.path.join("path", "to_ignore.txt"), ["/*.txt"], False),
+        ("file.txt", ["file.*"], True),
+        # The character "?" matches any one character except "/".
+        ("file.txt", ["fi?e.t?t"], True),
+        ("fi/e.txt", ["fi?e.t?t"], False),
+        # The range notation, e.g. [a-zA-Z], can be used
+        # to match one of the characters in a range. See fnmatch(3) and
+        # the FNM_PATHNAME flag for a more detailed description.
+        ("file.txt", ["[a-zA-Z]ile.txt"], True),
+        ("2ile.txt", ["[a-zA-Z]ile.txt"], False),
+        # Two consecutive asterisks ("**") in patterns matched against
+        # full pathname may have special meaning:
+        # A leading "**" followed by a slash means match in all directories.
+        # For example, "**/foo" matches file or directory "foo" anywhere, the
+        # same as pattern "foo".
+        # "**/foo/bar" matches file or directory "bar" anywhere that is
+        # directly under directory "foo".
+        (os.path.join("rel", "p", "p2", "to_ignore"), ["**/to_ignore"], True,),
+        (
+            os.path.join("rel", "p", "p2", "to_ignore"),
+            ["**/p2/to_ignore"],
+            True,
+        ),
+        (
+            os.path.join("rel", "path", "path2", "dont_ignore"),
+            ["**/to_ignore"],
+            False,
+        ),
+        # A trailing "/**" matches everything inside.
+        # For example, "abc/**" matches all files inside directory "abc",
+        # relative to the location of the .gitignore file, with infinite depth.
+        (os.path.join("rel", "p", "p2", "to_ignore"), ["rel/**"], True,),
+        (os.path.join("rel", "p", "p2", "to_ignore"), ["p/**"], False,),
+        (
+            os.path.join("rel", "path", "path2", "dont_ignore"),
+            ["rel/**"],
+            True,
+        ),
+        # A slash followed by two consecutive asterisks then a slash matches
+        # zero or more directories.
+        # For example, "a/**/b" matches "a/b", "a/x/b", "a/x/y/b" and so on.
+        (os.path.join("rel", "p", "to_ignore"), ["rel/**/to_ignore"], True,),
+        (
+            os.path.join("rel", "p", "p2", "to_ignore"),
+            ["rel/**/to_ignore"],
+            True,
+        ),
         (
             os.path.join("rel", "path", "path2", "dont_ignore"),
             ["rel/**/to_ignore"],
             False,
         ),
-        ("dont_ignore.txt", ["dont_ignore"], False),
-        ("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False),
+        (
+            os.path.join("rel", "path", "path2", "dont_ignore"),
+            ["path/**/dont_ignore"],
+            False,
+        ),
+        # Other consecutive asterisks are considered regular asterisks
+        # and will match according to the previous rules.
+        ("to_ignore.txt", ["/***.txt"], True),
+        (os.path.join("path", "to_ignore.txt"), ["/****.txt"], False),
+        (os.path.join("path", "to_ignore.txt"), ["****.txt"], True),
+        (os.path.join("data", "file.txt"), ["data/***"], True),
+        # bug from PathSpec
+        # (os.path.join("data", "p", "file.txt"), ["data/***"], False),
+        (os.path.join("data", "p", "file.txt"), ["***/file.txt"], False),
+        (
+            os.path.join("rel", "path", "path2", "to_ignore"),
+            ["rel/***/to_ignore"],
+            False,
+        ),
     ],
 )
 def test_match_ignore_from_file(
@@ -99,18 +192,3 @@ def test_should_ignore_dir(omit_dir):
     new_dirs, _ = ignore(root, dirs, files)
 
     assert set(new_dirs) == {"dir1", "dir2"}
-
-
-def test_ignore_order():
-    dvcignore_path = os.path.join(os.path.sep, "ignore_order", ".dvcignore")
-
-    patterns = ["!ac*", "a*", "!ab*"]
-
-    root = os.path.dirname(dvcignore_path)
-    dirs = ["ignore_order"]
-    files = ["ac", "ab", "aa"]
-
-    ignore = mock_dvcignore(dvcignore_path, patterns)
-    _, new_files = ignore(root, dirs, files)
-
-    assert {"ab"} == set(new_files)