From ac63803916010232fb1ddf63232f0936d67fa992 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Mon, 23 Jul 2018 15:59:43 +0200 Subject: [PATCH 1/2] Exclude dirs depending on dir full name (relative to root) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For example if I have a directory /mypath/folder with subdirectories ``` /mypath/folder ├── folderA │   ├── subfolderA │   ├── subfolderB │   └── subfolderC ├── folderB │   ├── subfolderA │   ├── subfolderB │   └── subfolderC └── folderC ├── subfolderA ├── subfolderB └── subfolderC ``` I would like to be able to start crawling at `/mypath/folder` and crawl everything except `/folderB/subfolderB` for example. I would like to be able to put `"excludes": ["/folderB/subfolderB"]` or even a wildcard like `"excludes": ["/folderB/subfolder*"]`. Closes #553. --- .../elasticsearch/crawler/fs/FsParser.java | 21 ++- docs/source/admin/fs/local-fs.rst | 52 ++++++- docs/source/installation.rst | 133 +++++++++++------- .../crawler/fs/framework/FsCrawlerUtil.java | 26 +++- .../integration/FsCrawlerTestIncludesIT.java | 9 +- .../subdir/notsub/roottxtfile.txt | 1 + .../subdir/sub1/roottxtfile.txt | 1 + .../subdir/sub2/roottxtfile.txt | 1 + .../elasticsearch/crawler/fs/settings/Fs.java | 2 +- .../crawler/fs/settings/FsMatchFilesTest.java | 108 +++++++++----- .../fs/settings/FsSettingsParserTest.java | 2 +- 11 files changed, 241 insertions(+), 115 deletions(-) create mode 100644 integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/notsub/roottxtfile.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub1/roottxtfile.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub2/roottxtfile.txt diff --git a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java index 4b53cd282..a5c216bef 100644 --- 
a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java +++ b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java @@ -62,7 +62,6 @@ import java.util.stream.Collectors; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.computeVirtualPathName; -import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.isExcluded; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.isIndexable; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.localDateTimeToDate; import static fr.pilato.elasticsearch.crawler.fs.tika.TikaDocParser.generate; @@ -240,19 +239,15 @@ private void addFilesRecursively(FileAbstractor path, String filepath, LocalD for (FileAbstractModel child : children) { String filename = child.getName(); - // https://github.com/dadoonet/fscrawler/issues/1 : Filter documents - boolean isIndexable = isIndexable(filename, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes()); + String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, filename).toString()); - // It can happen that we a dir "foo" which does not match the include name like "*.txt" - // We need to go in it unless it has been explicitly excluded by the user - if (child.isDirectory() && !isExcluded(filename, fsSettings.getFs().getExcludes())) { - isIndexable = true; - } + // https://github.com/dadoonet/fscrawler/issues/1 : Filter documents + boolean isIndexable = isIndexable(child.isDirectory(), virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes()); - logger.debug("[{}] can be indexed: [{}]", filename, isIndexable); + logger.debug("[{}] can be indexed: [{}]", virtualFileName, isIndexable); if (isIndexable) { if (child.isFile()) { - logger.debug(" - file: {}", filename); + logger.debug(" - file: {}", virtualFileName); fsFiles.add(filename); if (child.getLastModifiedDate().isAfter(lastScanDate) || 
(child.getCreationDate() != null && child.getCreationDate().isAfter(lastScanDate))) { @@ -300,7 +295,8 @@ private void addFilesRecursively(FileAbstractor path, String filepath, LocalD for (String esfile : esFiles) { logger.trace("Checking file [{}]", esfile); - if (isIndexable(esfile, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes()) + String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, esfile).toString()); + if (isIndexable(false, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes()) && !fsFiles.contains(esfile)) { logger.trace("Removing file [{}] in elasticsearch", esfile); esDelete(fsSettings.getElasticsearch().getIndex(), generateIdFromFilename(esfile, filepath)); @@ -314,7 +310,8 @@ private void addFilesRecursively(FileAbstractor path, String filepath, LocalD // for the delete folder for (String esfolder : esFolders) { - if (isIndexable(esfolder, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())) { + String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, esfolder).toString()); + if (isIndexable(true, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())) { logger.trace("Checking directory [{}]", esfolder); if (!fsFolders.contains(esfolder)) { logger.trace("Removing recursively directory [{}] in elasticsearch", esfolder); diff --git a/docs/source/admin/fs/local-fs.rst b/docs/source/admin/fs/local-fs.rst index 494d4c69b..0b28f1607 100644 --- a/docs/source/admin/fs/local-fs.rst +++ b/docs/source/admin/fs/local-fs.rst @@ -102,6 +102,8 @@ file system and another run. Which means that if you set it to ``15m``, the next scan will happen on 15 minutes after the end of the current scan, whatever its duration. +.. 
_includes_excludes: + Includes and excludes ^^^^^^^^^^^^^^^^^^^^^ @@ -117,21 +119,59 @@ Define ``fs.includes`` and ``fs.excludes`` properties in your "name" : "test", "fs": { "includes": [ - "*.doc", - "*.pdf" + "*/*.doc", + "*/*.pdf" ], "excludes": [ - "resume*" + "*/resume*" ] } } -It also applies to directory names. So if you want to ignore ``.ignore`` -dir, just add ``.ignore`` as an excluded name. Note that ``includes`` -does not apply to directory names but only to filenames. By default, FSCrawler will exclude files starting with ``~``. +.. versionadded:: 2.5 + +It also applies to directory names. So if you want to ignore ``.ignore`` +dir, just add ``.ignore`` as an excluded name. Note that ``includes`` and ``excludes`` +apply to directory names as well. + +Let's take the following example with the ``root`` dir as ``/tmp``: + +.. code:: + + /tmp + ├── folderA + │ ├── subfolderA + │ ├── subfolderB + │ └── subfolderC + ├── folderB + │ ├── subfolderA + │ ├── subfolderB + │ └── subfolderC + └── folderC + ├── subfolderA + ├── subfolderB + └── subfolderC + +If you define the following ``fs.excludes`` property in your +``~/.fscrawler/test/_settings.json`` file: + +.. code:: json + + { + "name" : "test", + "fs": { + "excludes": [ + "/folderB/subfolder*" + ] + } + } + +Then all files but the ones in ``/folderB/subfolderA``, ``/folderB/subfolderB`` and +``/folderB/subfolderC`` will be indexed. + Indexing JSon docs ^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 584540dac..cae321f27 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -200,55 +200,86 @@ Upgrade to 2.4 Upgrade to 2.5 ~~~~~~~~~~~~~~ -- A bug was causing a lot of data going over the wire each time - FSCrawler was running. To fix this issue, we changed the default - mapping and we set ``store: true`` on field ``file.filename``. 
If - this field is not stored and ``remove_deleted`` is ``true`` - (default), FSCrawler will fail while crawling your documents. You - need to create the new mapping accordingly and reindex your existing - data either by deleting the old index and running again FSCrawler or - by using the `reindex - API `__ - as follows: - -:: - - # Backup old index data - POST _reindex - { - "source": { - "index": "job_name" - }, - "dest": { - "index": "job_name_backup" - } - } - # Remove job_name index - DELETE job_name - -Restart FSCrawler with the following command. It will just create the -right mapping again. - -.. code:: sh - - $ bin/fscrawler job_name --loop 0 - -Then restore old data: - -:: - - POST _reindex - { - "source": { - "index": "job_name_backup" - }, - "dest": { - "index": "job_name" - } - } - # Remove backup index - DELETE job_name_backup - -The default mapping changed for FSCrawler for ``meta.raw.*`` fields. -Might be better to reindex your data. +- A bug was causing a lot of data going over the wire each time + FSCrawler was running. To fix this issue, we changed the default + mapping and we set ``store: true`` on field ``file.filename``. If + this field is not stored and ``remove_deleted`` is ``true`` + (default), FSCrawler will fail while crawling your documents. You + need to create the new mapping accordingly and reindex your existing + data either by deleting the old index and running again FSCrawler or + by using the `reindex + API `__ + as follows: + + :: + + # Backup old index data + POST _reindex + { + "source": { + "index": "job_name" + }, + "dest": { + "index": "job_name_backup" + } + } + # Remove job_name index + DELETE job_name + + Restart FSCrawler with the following command. It will just create the + right mapping again. + + .. 
code:: sh + + $ bin/fscrawler job_name --loop 0 + + Then restore old data: + + :: + + POST _reindex + { + "source": { + "index": "job_name_backup" + }, + "dest": { + "index": "job_name" + } + } + # Remove backup index + DELETE job_name_backup + + The default mapping changed for FSCrawler for ``meta.raw.*`` fields. + Might be better to reindex your data. + +- The ``excludes`` parameter is also used for directory names. But this + new implementation also brings a breaking change if you were using ``excludes`` + previously. In the previous implementation, the regular expression was only applied + to the filename. It's now applied to the full virtual path name. + + For example if you have a ``/tmp`` dir as follows: + + .. code:: + + /tmp + └── folder + ├── foo.txt + └── bar.txt + + Previously excluding ``foo.txt`` was excluding the virtual file ``/folder/foo.txt``. + If you still want to exclude any file named ``foo.txt`` whatever its directory + you now need to specify ``*/foo.txt``: + + .. code:: json + + { + "name" : "test", + "fs": { + "excludes": [ + "*/foo.txt" + ] + } + } + + For more information, read :ref:`includes_excludes`. 
diff --git a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java index d0b54cb14..c374236a0 100644 --- a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java +++ b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java @@ -127,15 +127,35 @@ public static String readJsonFile(Path dir, Path config, String version, String * @param includes include rules, may be empty not null * @param excludes exclude rules, may be empty not null */ - public static boolean isIndexable(String filename, List includes, List excludes) { - logger.debug("filename = [{}], includes = [{}], excludes = [{}]", filename, includes, excludes); - + private static boolean isIndexable(String filename, List includes, List excludes) { boolean excluded = isExcluded(filename, excludes); if (excluded) return false; return isIncluded(filename, includes); } + /** + * We check if we can index the file or if we should ignore it + * + * @param directory true if the current file is a directory, false in other case (actual file) + * @param filename The filename to scan + * @param includes include rules, may be empty not null + * @param excludes exclude rules, may be empty not null + */ + public static boolean isIndexable(boolean directory, String filename, List includes, List excludes) { + logger.debug("directory = [{}], filename = [{}], includes = [{}], excludes = [{}]", directory, filename, includes, excludes); + + boolean isIndexable = isIndexable(filename, includes, excludes); + + // It can happen that we have a dir "foo" which does not match the include name like "*.txt" + // We need to go in it unless it has been explicitly excluded by the user + if (directory && !isExcluded(filename, excludes)) { + isIndexable = true; + } + + return isIndexable; + } + /** + * We check if we can index the file or if we should ignore it + * 
diff --git a/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestIncludesIT.java b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestIncludesIT.java index 40317d49b..c8b347de2 100644 --- a/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestIncludesIT.java +++ b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestIncludesIT.java @@ -30,7 +30,7 @@ public class FsCrawlerTestIncludesIT extends AbstractFsCrawlerITCase { @Test public void test_includes() throws Exception { Fs fs = startCrawlerDefinition() - .addInclude("*_include.txt") + .addInclude("*/*_include\\.txt") .build(); startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); countTestHelper(new SearchRequest(getCrawlerName()), 1L, null); @@ -39,7 +39,7 @@ public void test_includes() throws Exception { @Test public void test_subdirs_with_patterns() throws Exception { Fs fs = startCrawlerDefinition() - .addInclude("*.txt") + .addInclude("*/*\\.txt") .build(); startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); @@ -50,11 +50,12 @@ public void test_subdirs_with_patterns() throws Exception { @Test public void test_ignore_dir() throws Exception { Fs fs = startCrawlerDefinition() - .addExclude(".ignore") + .addExclude("*/\\.ignore") + .addExclude("/subdir/sub*") .build(); startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); // We expect to have one file - countTestHelper(new SearchRequest(getCrawlerName()), 1L, null); + countTestHelper(new SearchRequest(getCrawlerName()), 2L, null); } } diff --git a/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/notsub/roottxtfile.txt b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/notsub/roottxtfile.txt new file mode 100644 index 000000000..16020bf6a --- /dev/null 
+++ b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/notsub/roottxtfile.txt @@ -0,0 +1 @@ +This file contains some words. \ No newline at end of file diff --git a/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub1/roottxtfile.txt b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub1/roottxtfile.txt new file mode 100644 index 000000000..16020bf6a --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub1/roottxtfile.txt @@ -0,0 +1 @@ +This file contains some words. \ No newline at end of file diff --git a/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub2/roottxtfile.txt b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub2/roottxtfile.txt new file mode 100644 index 000000000..16020bf6a --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub2/roottxtfile.txt @@ -0,0 +1 @@ +This file contains some words. 
\ No newline at end of file diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index 07fa5a2eb..b745d44e6 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -55,7 +55,7 @@ public static Builder builder() { } public static final String DEFAULT_DIR = "/tmp/es"; - public static final List DEFAULT_EXCLUDED = Collections.singletonList("~*"); + public static final List DEFAULT_EXCLUDED = Collections.singletonList("*/~*"); public static final Fs DEFAULT = Fs.builder().setUrl(DEFAULT_DIR).setExcludes(DEFAULT_EXCLUDED).build(); public static class Builder { diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchFilesTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchFilesTest.java index c7330b06a..6fdcd4af8 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchFilesTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchFilesTest.java @@ -35,61 +35,95 @@ public class FsMatchFilesTest extends AbstractFSCrawlerTestCase { @Test public void exclude_only() { - assertThat(isIndexable("test.doc", new ArrayList<>(), Collections.singletonList("*.doc")), is(false)); - assertThat(isIndexable("test.xls", new ArrayList<>(), Collections.singletonList("*.doc")), is(true)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Collections.singletonList("*.doc")), is(true)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Arrays.asList("*.doc", "*.xls")), is(false)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Collections.singletonList("my.d?c*.xls")), is(false)); - assertThat(isIndexable("my.douc.xls", new ArrayList<>(), Collections.singletonList("my.d?c*.xls")), is(true)); - 
assertThat(isIndexable(".snapshots", new ArrayList<>(), Collections.singletonList(".snapshots")), is(false)); - assertThat(isIndexable("doc.doc", new ArrayList<>(), Arrays.asList("*.pdf", "*.xls", "*.doc")), is(false)); - assertThat(isIndexable("doc.ppt", new ArrayList<>(), Arrays.asList("*.pdf", "*.xls", "*.doc")), is(true)); + assertThat(isIndexable(false, "/test.doc", new ArrayList<>(), Collections.singletonList("*/*.doc")), is(false)); + assertThat(isIndexable(false, "/test.xls", new ArrayList<>(), Collections.singletonList("*/*.doc")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Collections.singletonList("*/*.doc")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Arrays.asList("*/*.doc", "*/*.xls")), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Collections.singletonList("*/my.d?c*.xls")), is(false)); + assertThat(isIndexable(false, "/my.douc.xls", new ArrayList<>(), Collections.singletonList("*/my.d?c*.xls")), is(true)); + assertThat(isIndexable(false, "/.snapshots", new ArrayList<>(), Collections.singletonList("*/.snapshots")), is(false)); + assertThat(isIndexable(false, "/doc.doc", new ArrayList<>(), Arrays.asList("*/*.pdf", "*/*.xls", "*/*.doc")), is(false)); + assertThat(isIndexable(false, "/doc.ppt", new ArrayList<>(), Arrays.asList("*/*.pdf", "*/*.xls", "*/*.doc")), is(true)); } @Test public void include_only() { - assertThat(isIndexable("test.doc", Collections.singletonList("*.doc"), new ArrayList<>()), is(true)); - assertThat(isIndexable("test.xls", Collections.singletonList("*.doc"), new ArrayList<>()), is(false)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("*.doc"), new ArrayList<>()), is(false)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("my.d?c*.xls"), new ArrayList<>()), is(true)); - assertThat(isIndexable("my.douc.xls", Collections.singletonList("my.d?c*.xls"), new ArrayList<>()), is(false)); - 
assertThat(isIndexable("doc.doc", Arrays.asList("*.pdf", "*.xls", "*.doc"), new ArrayList<>()), is(true)); - assertThat(isIndexable("doc.ppt", Arrays.asList("*.pdf", "*.xls", "*.doc"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/test.doc", Collections.singletonList("*/*.doc"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/test.xls", Collections.singletonList("*/*.doc"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/*.doc"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/my.d?c*.xls"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/my.douc.xls", Collections.singletonList("*/my.d?c*.xls"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/doc.doc", Arrays.asList("*/*.pdf", "*/*.xls", "*/*.doc"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/doc.ppt", Arrays.asList("*/*.pdf", "*/*.xls", "*/*.doc"), new ArrayList<>()), is(false)); } @Test public void include_exclude() { - assertThat(isIndexable("test.doc", Collections.singletonList("*.xls"), Collections.singletonList("*.doc")), is(false)); - assertThat(isIndexable("test.xls", Collections.singletonList("*.xls"), Collections.singletonList("*.doc")), is(true)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("*.xls"), Collections.singletonList("*.doc")), is(true)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("*.xls"), Collections.singletonList("my.d?c*.xls")), is(false)); - assertThat(isIndexable("my.douc.xls", Collections.singletonList("*.xls"), Collections.singletonList("my.d?c*.xls")), is(true)); + assertThat(isIndexable(false, "/test.doc", Collections.singletonList("*/*.xls"), Collections.singletonList("*/*.doc")), is(false)); + assertThat(isIndexable(false, "/test.xls", Collections.singletonList("*/*.xls"), Collections.singletonList("*/*.doc")), is(true)); + 
assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/*.xls"), Collections.singletonList("*/*.doc")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/*.xls"), Collections.singletonList("*/my.d?c*.xls")), is(false)); + assertThat(isIndexable(false, "/my.douc.xls", Collections.singletonList("*/*.xls"), Collections.singletonList("*/my.d?c*.xls")), is(true)); } @Test public void default_ignored_file() { - assertThat(isIndexable("~mydoc", new ArrayList<>(), DEFAULT_EXCLUDED), is(false)); - assertThat(isIndexable("~", new ArrayList<>(), DEFAULT_EXCLUDED), is(false)); - assertThat(isIndexable("adoc.doc", new ArrayList<>(), DEFAULT_EXCLUDED), is(true)); - assertThat(isIndexable("mydoc~", new ArrayList<>(), DEFAULT_EXCLUDED), is(true)); + assertThat(isIndexable(false, "/~mydoc", new ArrayList<>(), DEFAULT_EXCLUDED), is(false)); + assertThat(isIndexable(false, "/~", new ArrayList<>(), DEFAULT_EXCLUDED), is(false)); + assertThat(isIndexable(false, "/adoc.doc", new ArrayList<>(), DEFAULT_EXCLUDED), is(true)); + assertThat(isIndexable(false, "/mydoc~", new ArrayList<>(), DEFAULT_EXCLUDED), is(true)); } @Test public void case_sensitive() { // Excludes - assertThat(isIndexable("test.doc", new ArrayList<>(), Collections.singletonList("*.DOC")), is(false)); - assertThat(isIndexable("test.xls", new ArrayList<>(), Collections.singletonList("*.DOC")), is(true)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Collections.singletonList("*.DOC")), is(true)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Arrays.asList("*.DOC", "*.XLS")), is(false)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Collections.singletonList("MY.D?C*.XLS")), is(false)); - assertThat(isIndexable("my.douc.xls", new ArrayList<>(), Collections.singletonList("MY.d?C*.XLS")), is(true)); - assertThat(isIndexable(".snapshots", new ArrayList<>(), Collections.singletonList(".SNAPSHOTS")), is(false)); + 
assertThat(isIndexable(false, "/test.doc", new ArrayList<>(), Collections.singletonList("*/*.DOC")), is(false)); + assertThat(isIndexable(false, "/test.xls", new ArrayList<>(), Collections.singletonList("*/*.DOC")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Collections.singletonList("*/*.DOC")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Arrays.asList("*/*.DOC", "*/*.XLS")), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Collections.singletonList("*/MY.D?C*.XLS")), is(false)); + assertThat(isIndexable(false, "/my.douc.xls", new ArrayList<>(), Collections.singletonList("*/MY.d?C*.XLS")), is(true)); + assertThat(isIndexable(false, "/.snapshots", new ArrayList<>(), Collections.singletonList("*/.SNAPSHOTS")), is(false)); // Includes - assertThat(isIndexable("test.doc", Collections.singletonList("*.DOC"), new ArrayList<>()), is(true)); - assertThat(isIndexable("test.xls", Collections.singletonList("*.DOC"), new ArrayList<>()), is(false)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("*.DOC"), new ArrayList<>()), is(false)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("MY.D?C*.XLS"), new ArrayList<>()), is(true)); - assertThat(isIndexable("my.douc.xls", Collections.singletonList("MY.D?C*.XLS"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/test.doc", Collections.singletonList("*/*.DOC"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/test.xls", Collections.singletonList("*/*.DOC"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/*.DOC"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/MY.D?C*.XLS"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/my.douc.xls", Collections.singletonList("*/MY.D?C*.XLS"), new ArrayList<>()), is(false)); + } + + @Test + public 
void directories() { + assertThat(isIndexable(true, "/folderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderB/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(false)); + assertThat(isIndexable(true, "/folderB/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(false)); + assertThat(isIndexable(true, "/folderB/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(false)); + assertThat(isIndexable(true, "/folderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + 
assertThat(isIndexable(true, "/folderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderB/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderB/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(false)); + assertThat(isIndexable(true, "/folderB/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); } } diff --git 
a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java index 3f30d2f0b..dc631979b 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java @@ -126,7 +126,7 @@ public void testWithSimplestJsonJobFile() throws IOException { assertThat(settings.getFs(), notNullValue()); assertThat(settings.getFs().getChecksum(), nullValue()); assertThat(settings.getFs().getIncludes(), nullValue()); - assertThat(settings.getFs().getExcludes(), contains("~*")); + assertThat(settings.getFs().getExcludes(), contains("*/~*")); assertThat(settings.getFs().getIndexedChars(), nullValue()); assertThat(settings.getFs().getUpdateRate(), is(TimeValue.timeValueMinutes(15))); assertThat(settings.getFs().getUrl(), is("/tmp/es")); From 812fe7dca312a911dccb31320dcac1d50dc44783 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Sat, 28 Jul 2018 10:17:48 +0200 Subject: [PATCH 2/2] Update to elasticsearch 6.3.2 Related to #569. --- docs/source/fscrawler.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/fscrawler.ini b/docs/source/fscrawler.ini index 3da937ad2..2ec1e481c 100644 --- a/docs/source/fscrawler.ini +++ b/docs/source/fscrawler.ini @@ -3,7 +3,7 @@ Version=2.5-SNAPSHOT [3rdParty] TikaVersion=1.18 -ElasticsearchVersion=6.3.1 +ElasticsearchVersion=6.3.2 LevigoVersion=2.0 TiffVersion=1.3.1 JpegVersion=1.3.0