
Fix the implementation
This is now excluding the directories we don't want to include.
dadoonet committed Jul 23, 2018
1 parent 4d5b7d3 commit 0a86ec2
Showing 8 changed files with 146 additions and 67 deletions.
@@ -239,13 +239,15 @@ private void addFilesRecursively(FileAbstractor<?> path, String filepath, LocalD
         for (FileAbstractModel child : children) {
             String filename = child.name;
 
+            String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, filename).toString());
+
             // https://github.com/dadoonet/fscrawler/issues/1 : Filter documents
-            boolean isIndexable = isIndexable(child.directory, filename, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes());
+            boolean isIndexable = isIndexable(child.directory, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes());
 
-            logger.debug("[{}] can be indexed: [{}]", filename, isIndexable);
+            logger.debug("[{}] can be indexed: [{}]", virtualFileName, isIndexable);
             if (isIndexable) {
                 if (child.file) {
-                    logger.debug(" - file: {}", filename);
+                    logger.debug(" - file: {}", virtualFileName);
                     fsFiles.add(filename);
                     if (child.lastModifiedDate.isAfter(lastScanDate) ||
                             (child.creationDate != null && child.creationDate.isAfter(lastScanDate))) {
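
The change above is easier to reason about in isolation. The sketch below is illustrative only: the helper names mirror those in the patch, but their bodies and the glob-to-regex translation are assumptions, not FSCrawler's actual implementation. It shows why matching against the virtual path (the path relative to the crawl root) rather than the bare filename lets directory-level patterns take effect.

import java.io.File;
import java.util.List;
import java.util.regex.Pattern;

public class VirtualPathMatchSketch {

    // Build the path relative to the crawl root, e.g. "/folderB/subfolderA/report.doc"
    static String computeVirtualPathName(String rootPath, String realPath) {
        String virtual = realPath.substring(rootPath.length());
        return virtual.isEmpty() ? "/" : virtual.replace('\\', '/');
    }

    // Very rough glob support for the sketch: '*' matches any run of characters
    static boolean matches(String pattern, String virtualPath) {
        String regex = Pattern.quote(pattern).replace("*", "\\E.*\\Q");
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(virtualPath).matches();
    }

    static boolean isIndexable(String virtualPath, List<String> includes, List<String> excludes) {
        for (String exclude : excludes) {
            if (matches(exclude, virtualPath)) return false;   // an exclude match wins
        }
        if (includes.isEmpty()) return true;                   // no includes means everything else passes
        for (String include : includes) {
            if (matches(include, virtualPath)) return true;
        }
        return false;
    }

    public static void main(String[] args) {
        String root = "/tmp";
        String file = new File("/tmp/folderB/subfolderA", "report.doc").toString();
        String virtual = computeVirtualPathName(root, file);   // "/folderB/subfolderA/report.doc"
        // Matching on the filename alone could never see "folderB"; the virtual path can.
        System.out.println(isIndexable(virtual, List.of("*/*.doc"), List.of("/folderB/subfolder*")));
        // prints: false (the directory-level exclude now applies)
    }
}

Before this commit, the same include/exclude patterns were matched against the filename alone, so a pattern such as /folderB/subfolder* could never match anything.
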
@@ -293,7 +295,8 @@ private void addFilesRecursively(FileAbstractor<?> path, String filepath, LocalD
             for (String esfile : esFiles) {
                 logger.trace("Checking file [{}]", esfile);
 
-                if (isIndexable(false, esfile, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())
+                String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, esfile).toString());
+                if (isIndexable(false, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())
                         && !fsFiles.contains(esfile)) {
                     logger.trace("Removing file [{}] in elasticsearch", esfile);
                     esDelete(fsSettings.getElasticsearch().getIndex(), generateIdFromFilename(esfile, filepath));
@@ -307,7 +310,8 @@ private void addFilesRecursively(FileAbstractor<?> path, String filepath, LocalD
 
             // for the delete folder
             for (String esfolder : esFolders) {
-                if (isIndexable(true, esfolder, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())) {
+                String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, esfolder).toString());
+                if (isIndexable(true, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())) {
                     logger.trace("Checking directory [{}]", esfolder);
                     if (!fsFolders.contains(esfolder)) {
                         logger.trace("Removing recursively directory [{}] in elasticsearch", esfolder);
52 changes: 46 additions & 6 deletions docs/source/admin/fs/local-fs.rst
@@ -102,6 +102,8 @@ file system and another run. Which means that if you set it to ``15m``,
 the next scan will happen on 15 minutes after the end of the current
 scan, whatever its duration.
 
+.. _includes_excludes:
+
 Includes and excludes
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -117,21 +119,59 @@ Define ``fs.includes`` and ``fs.excludes`` properties in your
       "name" : "test",
       "fs": {
         "includes": [
-          "*.doc",
-          "*.pdf"
+          "*/*.doc",
+          "*/*.pdf"
         ],
         "excludes": [
-          "resume*"
+          "*/resume*"
         ]
       }
     }
 
-It also applies to directory names. So if you want to ignore ``.ignore``
-dir, just add ``.ignore`` as an excluded name. Note that ``includes``
-does not apply to directory names but only to filenames.
-
 By default, FSCrawler will exclude files starting with ``~``.
 
+.. versionadded:: 2.5
+
+It also applies to directory names. So if you want to ignore ``.ignore``
+dir, just add ``.ignore`` as an excluded name. Note that ``includes`` and ``excludes``
+apply to directory names as well.
+
+Let's take the following example with the ``root`` dir as ``/tmp``:
+
+.. code::
+
+    /tmp
+    ├── folderA
+    │   ├── subfolderA
+    │   ├── subfolderB
+    │   └── subfolderC
+    ├── folderB
+    │   ├── subfolderA
+    │   ├── subfolderB
+    │   └── subfolderC
+    └── folderC
+        ├── subfolderA
+        ├── subfolderB
+        └── subfolderC
+
+If you define the following ``fs.excludes`` property in your
+``~/.fscrawler/test/_settings.json`` file:
+
+.. code:: json
+
+    {
+      "name" : "test",
+      "fs": {
+        "excludes": [
+          "/folderB/subfolder*"
+        ]
+      }
+    }
+
+Then all files but the ones in ``/folderB/subfolderA``, ``/folderB/subfolderB`` and
+``/folderB/subfolderC`` will be indexed.
+
 Indexing JSon docs
 ^^^^^^^^^^^^^^^^^^

133 changes: 82 additions & 51 deletions docs/source/installation.rst
@@ -200,55 +200,86 @@ Upgrade to 2.4
Upgrade to 2.5
~~~~~~~~~~~~~~

- A bug was causing a lot of data going over the wire each time
  FSCrawler was running. To fix this issue, we changed the default
  mapping and we set ``store: true`` on field ``file.filename``. If
  this field is not stored and ``remove_deleted`` is ``true``
  (default), FSCrawler will fail while crawling your documents. You
  need to create the new mapping accordingly and reindex your existing
  data either by deleting the old index and running again FSCrawler or
  by using the `reindex API <https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html>`__
  as follows:

  ::

     # Backup old index data
     POST _reindex
     {
       "source": {
         "index": "job_name"
       },
       "dest": {
         "index": "job_name_backup"
       }
     }
     # Remove job_name index
     DELETE job_name

  Restart FSCrawler with the following command. It will just create the
  right mapping again.

  .. code:: sh

     $ bin/fscrawler job_name --loop 0

  Then restore old data:

  ::

     POST _reindex
     {
       "source": {
         "index": "job_name_backup"
       },
       "dest": {
         "index": "job_name"
       }
     }
     # Remove backup index
     DELETE job_name_backup

  The default mapping changed for FSCrawler for ``meta.raw.*`` fields.
  Might be better to reindex your data.

- The ``excludes`` parameter is also used for directory names. But this
  new implementation also brings a breaking change if you were using ``excludes``
  previously. In the previous implementation, the regular expression was only applied
  to the filename. It's now applied to the full virtual path name.

  For example if you have a ``/tmp`` dir as follows:

  .. code::

     /tmp
     └── folder
         ├── foo.txt
         └── bar.txt

  Previously excluding ``foo.txt`` was excluding the virtual file ``/folder/foo.txt``.
  If you still want to exclude any file named ``foo.txt`` whatever its directory
  you now need to specify ``*/foo.txt``:

  .. code:: json

     {
       "name" : "test",
       "fs": {
         "excludes": [
           "*/foo.txt"
         ]
       }
     }

  For more information, read :ref:`includes_excludes`.
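
To make the breaking change concrete, here is a minimal, self-contained check. The glob-to-regex translation below is an assumption for illustration and not FSCrawler's actual matcher, but the point about matching against the full virtual path holds.

import java.util.regex.Pattern;

public class ExcludePatternChange {

    // Assumed '*' glob semantics translated to a full-match regex (illustrative only)
    static boolean globMatches(String pattern, String path) {
        String regex = Pattern.quote(pattern).replace("*", "\\E.*\\Q");
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(path).matches();
    }

    public static void main(String[] args) {
        String virtualPath = "/folder/foo.txt";                     // matching now happens against this full virtual path
        System.out.println(globMatches("foo.txt", virtualPath));    // false: a bare filename pattern no longer matches
        System.out.println(globMatches("*/foo.txt", virtualPath));  // true: prefix the old pattern with */
    }
}
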

@@ -30,7 +30,7 @@ public class FsCrawlerTestIncludesIT extends AbstractFsCrawlerITCase {
     @Test
     public void test_includes() throws Exception {
         Fs fs = startCrawlerDefinition()
-                .addInclude("*_include.txt")
+                .addInclude("*/*_include\\.txt")
                 .build();
         startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
         countTestHelper(new SearchRequest(getCrawlerName()), 1L, null);
@@ -39,7 +39,7 @@ public void test_includes() throws Exception {
     @Test
     public void test_subdirs_with_patterns() throws Exception {
         Fs fs = startCrawlerDefinition()
-                .addInclude("*.txt")
+                .addInclude("*/*\\.txt")
                 .build();
         startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
 
@@ -50,11 +50,12 @@ public void test_subdirs_with_patterns() throws Exception {
     @Test
     public void test_ignore_dir() throws Exception {
         Fs fs = startCrawlerDefinition()
-                .addExclude(".ignore")
+                .addExclude("*/\\.ignore")
+                .addExclude("/subdir/sub*")
                 .build();
         startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
 
         // We expect to have one file
-        countTestHelper(new SearchRequest(getCrawlerName()), 1L, null);
+        countTestHelper(new SearchRequest(getCrawlerName()), 2L, null);
     }
 }
@@ -0,0 +1 @@
+This file contains some words.

@@ -0,0 +1 @@
+This file contains some words.

@@ -0,0 +1 @@
+This file contains some words.
@@ -55,7 +55,7 @@ public static Builder builder() {
     }
 
     public static final String DEFAULT_DIR = "/tmp/es";
-    public static final List<String> DEFAULT_EXCLUDED = Collections.singletonList("~*");
+    public static final List<String> DEFAULT_EXCLUDED = Collections.singletonList("*/~*");
     public static final Fs DEFAULT = Fs.builder().setUrl(DEFAULT_DIR).setExcludes(DEFAULT_EXCLUDED).build();
 
     public static class Builder {
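
The default exclude moves from "~*" to "*/~*" for the same reason: the pattern is now evaluated against the full virtual path, so it needs a leading "*/" to keep skipping tilde-prefixed temporary files in any directory. A minimal check under the same assumed glob semantics as the sketches above (the sample path is hypothetical):

import java.util.regex.Pattern;

public class DefaultExcludeCheck {

    // Same assumed glob-to-regex translation as in the earlier sketches
    static boolean globMatches(String pattern, String path) {
        String regex = Pattern.quote(pattern).replace("*", "\\E.*\\Q");
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(path).matches();
    }

    public static void main(String[] args) {
        String tempFile = "/folderA/~$report.docx";               // hypothetical Office temp file
        System.out.println(globMatches("~*", tempFile));          // false against a full virtual path
        System.out.println(globMatches("*/~*", tempFile));        // true: the new default still skips it
    }
}
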
