From ac63803916010232fb1ddf63232f0936d67fa992 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Mon, 23 Jul 2018 15:59:43 +0200 Subject: [PATCH 1/2] Exclude dirs depending on dir full name (relative to root) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For example if I have a directory /mypath/folder with subdirectories ``` /mypath/folder ├── folderA │   ├── subfolderA │   ├── subfolderB │   └── subfolderC ├── folderB │   ├── subfolderA │   ├── subfolderB │   └── subfolderC └── folderC ├── subfolderA ├── subfolderB └── subfolderC ``` I would like to be able to start crawling at `/mypath/folder` and crawl everything except `/folderB/subfolderB` for example. I would like to be able to put `"excludes": ["/folderB/subfolderB"]` or even a wildcard like `"excludes": ["/folderB/subfolder*"]`. Closes #553. --- .../elasticsearch/crawler/fs/FsParser.java | 21 ++- docs/source/admin/fs/local-fs.rst | 52 ++++++- docs/source/installation.rst | 133 +++++++++++------- .../crawler/fs/framework/FsCrawlerUtil.java | 26 +++- .../integration/FsCrawlerTestIncludesIT.java | 9 +- .../subdir/notsub/roottxtfile.txt | 1 + .../subdir/sub1/roottxtfile.txt | 1 + .../subdir/sub2/roottxtfile.txt | 1 + .../elasticsearch/crawler/fs/settings/Fs.java | 2 +- .../crawler/fs/settings/FsMatchFilesTest.java | 108 +++++++++----- .../fs/settings/FsSettingsParserTest.java | 2 +- 11 files changed, 241 insertions(+), 115 deletions(-) create mode 100644 integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/notsub/roottxtfile.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub1/roottxtfile.txt create mode 100644 integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub2/roottxtfile.txt diff --git a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java index 4b53cd282..a5c216bef 100644 --- 
a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java +++ b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java @@ -62,7 +62,6 @@ import java.util.stream.Collectors; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.computeVirtualPathName; -import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.isExcluded; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.isIndexable; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.localDateTimeToDate; import static fr.pilato.elasticsearch.crawler.fs.tika.TikaDocParser.generate; @@ -240,19 +239,15 @@ private void addFilesRecursively(FileAbstractor path, String filepath, LocalD for (FileAbstractModel child : children) { String filename = child.getName(); - // https://github.com/dadoonet/fscrawler/issues/1 : Filter documents - boolean isIndexable = isIndexable(filename, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes()); + String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, filename).toString()); - // It can happen that we a dir "foo" which does not match the include name like "*.txt" - // We need to go in it unless it has been explicitly excluded by the user - if (child.isDirectory() && !isExcluded(filename, fsSettings.getFs().getExcludes())) { - isIndexable = true; - } + // https://github.com/dadoonet/fscrawler/issues/1 : Filter documents + boolean isIndexable = isIndexable(child.isDirectory(), virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes()); - logger.debug("[{}] can be indexed: [{}]", filename, isIndexable); + logger.debug("[{}] can be indexed: [{}]", virtualFileName, isIndexable); if (isIndexable) { if (child.isFile()) { - logger.debug(" - file: {}", filename); + logger.debug(" - file: {}", virtualFileName); fsFiles.add(filename); if (child.getLastModifiedDate().isAfter(lastScanDate) || 
(child.getCreationDate() != null && child.getCreationDate().isAfter(lastScanDate))) { @@ -300,7 +295,8 @@ private void addFilesRecursively(FileAbstractor path, String filepath, LocalD for (String esfile : esFiles) { logger.trace("Checking file [{}]", esfile); - if (isIndexable(esfile, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes()) + String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, esfile).toString()); + if (isIndexable(false, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes()) && !fsFiles.contains(esfile)) { logger.trace("Removing file [{}] in elasticsearch", esfile); esDelete(fsSettings.getElasticsearch().getIndex(), generateIdFromFilename(esfile, filepath)); @@ -314,7 +310,8 @@ private void addFilesRecursively(FileAbstractor path, String filepath, LocalD // for the delete folder for (String esfolder : esFolders) { - if (isIndexable(esfolder, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())) { + String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, esfolder).toString()); + if (isIndexable(true, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())) { logger.trace("Checking directory [{}]", esfolder); if (!fsFolders.contains(esfolder)) { logger.trace("Removing recursively directory [{}] in elasticsearch", esfolder); diff --git a/docs/source/admin/fs/local-fs.rst b/docs/source/admin/fs/local-fs.rst index 494d4c69b..0b28f1607 100644 --- a/docs/source/admin/fs/local-fs.rst +++ b/docs/source/admin/fs/local-fs.rst @@ -102,6 +102,8 @@ file system and another run. Which means that if you set it to ``15m``, the next scan will happen on 15 minutes after the end of the current scan, whatever its duration. +.. 
_includes_excludes: + Includes and excludes ^^^^^^^^^^^^^^^^^^^^^ @@ -117,21 +119,59 @@ Define ``fs.includes`` and ``fs.excludes`` properties in your "name" : "test", "fs": { "includes": [ - "*.doc", - "*.pdf" + "*/*.doc", + "*/*.pdf" ], "excludes": [ - "resume*" + "*/resume*" ] } } -It also applies to directory names. So if you want to ignore ``.ignore`` -dir, just add ``.ignore`` as an excluded name. Note that ``includes`` -does not apply to directory names but only to filenames. By default, FSCrawler will exclude files starting with ``~``. +.. versionadded:: 2.5 + +It also applies to directory names. So if you want to ignore ``.ignore`` +dir, just add ``.ignore`` as an excluded name. Note that ``includes`` and ``excludes`` +apply to directory names as well. + +Let's take the following example with the ``root`` dir as ``/tmp``: + +.. code:: + + /tmp + ├── folderA + │ ├── subfolderA + │ ├── subfolderB + │ └── subfolderC + ├── folderB + │ ├── subfolderA + │ ├── subfolderB + │ └── subfolderC + └── folderC + ├── subfolderA + ├── subfolderB + └── subfolderC + +If you define the following ``fs.excludes`` property in your +``~/.fscrawler/test/_settings.json`` file: + +.. code:: json + + { + "name" : "test", + "fs": { + "excludes": [ + "/folderB/subfolder*" + ] + } + } + +Then all files but the ones in ``/folderB/subfolderA``, ``/folderB/subfolderB`` and +``/folderB/subfolderC`` will be indexed. + Indexing JSon docs ^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 584540dac..cae321f27 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -200,55 +200,86 @@ Upgrade to 2.4 Upgrade to 2.5 ~~~~~~~~~~~~~~ -- A bug was causing a lot of data going over the wire each time - FSCrawler was running. To fix this issue, we changed the default - mapping and we set ``store: true`` on field ``file.filename``. 
If - this field is not stored and ``remove_deleted`` is ``true`` - (default), FSCrawler will fail while crawling your documents. You - need to create the new mapping accordingly and reindex your existing - data either by deleting the old index and running again FSCrawler or - by using the `reindex - API `__ - as follows: - -:: - - # Backup old index data - POST _reindex - { - "source": { - "index": "job_name" - }, - "dest": { - "index": "job_name_backup" - } - } - # Remove job_name index - DELETE job_name - -Restart FSCrawler with the following command. It will just create the -right mapping again. - -.. code:: sh - - $ bin/fscrawler job_name --loop 0 - -Then restore old data: - -:: - - POST _reindex - { - "source": { - "index": "job_name_backup" - }, - "dest": { - "index": "job_name" - } - } - # Remove backup index - DELETE job_name_backup - -The default mapping changed for FSCrawler for ``meta.raw.*`` fields. -Might be better to reindex your data. +- A bug was causing a lot of data going over the wire each time + FSCrawler was running. To fix this issue, we changed the default + mapping and we set ``store: true`` on field ``file.filename``. If + this field is not stored and ``remove_deleted`` is ``true`` + (default), FSCrawler will fail while crawling your documents. You + need to create the new mapping accordingly and reindex your existing + data either by deleting the old index and running again FSCrawler or + by using the `reindex + API `__ + as follows: + + :: + + # Backup old index data + POST _reindex + { + "source": { + "index": "job_name" + }, + "dest": { + "index": "job_name_backup" + } + } + # Remove job_name index + DELETE job_name + + Restart FSCrawler with the following command. It will just create the + right mapping again. + + .. 
code:: sh + + $ bin/fscrawler job_name --loop 0 + + Then restore old data: + + :: + + POST _reindex + { + "source": { + "index": "job_name_backup" + }, + "dest": { + "index": "job_name" + } + } + # Remove backup index + DELETE job_name_backup + + The default mapping changed for FSCrawler for ``meta.raw.*`` fields. + Might be better to reindex your data. + +- The ``excludes`` parameter is also used for directory names. But this + new implementation also brings a breaking change if you were using ``excludes`` + previously. In the previous implementation, the regular expression was only applied + to the filename. It's now applied to the full virtual path name. + + For example if you have a ``/tmp`` dir as follows: + + .. code:: + + /tmp + └── folder + ├── foo.txt + └── bar.txt + + Previously excluding ``foo.txt`` was excluding the virtual file ``/folder/foo.txt``. + If you still want to exclude any file named ``foo.txt`` whatever its directory + you now need to specify ``*/foo.txt``: + + .. code:: json + + { + "name" : "test", + "fs": { + "excludes": [ + "*/foo.txt" + ] + } + } + + For more information, read :ref:`includes_excludes`. 
diff --git a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java index d0b54cb14..c374236a0 100644 --- a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java +++ b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java @@ -127,15 +127,35 @@ public static String readJsonFile(Path dir, Path config, String version, String * @param includes include rules, may be empty not null * @param excludes exclude rules, may be empty not null */ - public static boolean isIndexable(String filename, List includes, List excludes) { - logger.debug("filename = [{}], includes = [{}], excludes = [{}]", filename, includes, excludes); - + private static boolean isIndexable(String filename, List includes, List excludes) { boolean excluded = isExcluded(filename, excludes); if (excluded) return false; return isIncluded(filename, includes); } + /** + * We check if we can index the file or if we should ignore it + * + * @param directory true if the current file is a directory, false in other case (actual file) + * @param filename The filename to scan + * @param includes include rules, may be empty not null + * @param excludes exclude rules, may be empty not null + */ + public static boolean isIndexable(boolean directory, String filename, List includes, List excludes) { + logger.debug("directory = [{}], filename = [{}], includes = [{}], excludes = [{}]", directory, filename, includes, excludes); + + boolean isIndexable = isIndexable(filename, includes, excludes); + + // It can happen that we have a dir "foo" which does not match the include name like "*.txt" + // We need to go in it unless it has been explicitly excluded by the user + if (directory && !isExcluded(filename, excludes)) { + isIndexable = true; + } + + return isIndexable; + } + /** + * We check if we can index the file or if we should ignore it + * 
diff --git a/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestIncludesIT.java b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestIncludesIT.java index 40317d49b..c8b347de2 100644 --- a/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestIncludesIT.java +++ b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestIncludesIT.java @@ -30,7 +30,7 @@ public class FsCrawlerTestIncludesIT extends AbstractFsCrawlerITCase { @Test public void test_includes() throws Exception { Fs fs = startCrawlerDefinition() - .addInclude("*_include.txt") + .addInclude("*/*_include\\.txt") .build(); startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); countTestHelper(new SearchRequest(getCrawlerName()), 1L, null); @@ -39,7 +39,7 @@ public void test_includes() throws Exception { @Test public void test_subdirs_with_patterns() throws Exception { Fs fs = startCrawlerDefinition() - .addInclude("*.txt") + .addInclude("*/*\\.txt") .build(); startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); @@ -50,11 +50,12 @@ public void test_subdirs_with_patterns() throws Exception { @Test public void test_ignore_dir() throws Exception { Fs fs = startCrawlerDefinition() - .addExclude(".ignore") + .addExclude("*/\\.ignore") + .addExclude("/subdir/sub*") .build(); startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); // We expect to have one file - countTestHelper(new SearchRequest(getCrawlerName()), 1L, null); + countTestHelper(new SearchRequest(getCrawlerName()), 2L, null); } } diff --git a/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/notsub/roottxtfile.txt b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/notsub/roottxtfile.txt new file mode 100644 index 000000000..16020bf6a --- /dev/null 
+++ b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/notsub/roottxtfile.txt @@ -0,0 +1 @@ +This file contains some words. \ No newline at end of file diff --git a/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub1/roottxtfile.txt b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub1/roottxtfile.txt new file mode 100644 index 000000000..16020bf6a --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub1/roottxtfile.txt @@ -0,0 +1 @@ +This file contains some words. \ No newline at end of file diff --git a/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub2/roottxtfile.txt b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub2/roottxtfile.txt new file mode 100644 index 000000000..16020bf6a --- /dev/null +++ b/integration-tests/src/test/resources-binary/samples/test_ignore_dir/subdir/sub2/roottxtfile.txt @@ -0,0 +1 @@ +This file contains some words. 
\ No newline at end of file diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index 07fa5a2eb..b745d44e6 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -55,7 +55,7 @@ public static Builder builder() { } public static final String DEFAULT_DIR = "/tmp/es"; - public static final List DEFAULT_EXCLUDED = Collections.singletonList("~*"); + public static final List DEFAULT_EXCLUDED = Collections.singletonList("*/~*"); public static final Fs DEFAULT = Fs.builder().setUrl(DEFAULT_DIR).setExcludes(DEFAULT_EXCLUDED).build(); public static class Builder { diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchFilesTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchFilesTest.java index c7330b06a..6fdcd4af8 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchFilesTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsMatchFilesTest.java @@ -35,61 +35,95 @@ public class FsMatchFilesTest extends AbstractFSCrawlerTestCase { @Test public void exclude_only() { - assertThat(isIndexable("test.doc", new ArrayList<>(), Collections.singletonList("*.doc")), is(false)); - assertThat(isIndexable("test.xls", new ArrayList<>(), Collections.singletonList("*.doc")), is(true)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Collections.singletonList("*.doc")), is(true)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Arrays.asList("*.doc", "*.xls")), is(false)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Collections.singletonList("my.d?c*.xls")), is(false)); - assertThat(isIndexable("my.douc.xls", new ArrayList<>(), Collections.singletonList("my.d?c*.xls")), is(true)); - 
assertThat(isIndexable(".snapshots", new ArrayList<>(), Collections.singletonList(".snapshots")), is(false)); - assertThat(isIndexable("doc.doc", new ArrayList<>(), Arrays.asList("*.pdf", "*.xls", "*.doc")), is(false)); - assertThat(isIndexable("doc.ppt", new ArrayList<>(), Arrays.asList("*.pdf", "*.xls", "*.doc")), is(true)); + assertThat(isIndexable(false, "/test.doc", new ArrayList<>(), Collections.singletonList("*/*.doc")), is(false)); + assertThat(isIndexable(false, "/test.xls", new ArrayList<>(), Collections.singletonList("*/*.doc")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Collections.singletonList("*/*.doc")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Arrays.asList("*/*.doc", "*/*.xls")), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Collections.singletonList("*/my.d?c*.xls")), is(false)); + assertThat(isIndexable(false, "/my.douc.xls", new ArrayList<>(), Collections.singletonList("*/my.d?c*.xls")), is(true)); + assertThat(isIndexable(false, "/.snapshots", new ArrayList<>(), Collections.singletonList("*/.snapshots")), is(false)); + assertThat(isIndexable(false, "/doc.doc", new ArrayList<>(), Arrays.asList("*/*.pdf", "*/*.xls", "*/*.doc")), is(false)); + assertThat(isIndexable(false, "/doc.ppt", new ArrayList<>(), Arrays.asList("*/*.pdf", "*/*.xls", "*/*.doc")), is(true)); } @Test public void include_only() { - assertThat(isIndexable("test.doc", Collections.singletonList("*.doc"), new ArrayList<>()), is(true)); - assertThat(isIndexable("test.xls", Collections.singletonList("*.doc"), new ArrayList<>()), is(false)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("*.doc"), new ArrayList<>()), is(false)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("my.d?c*.xls"), new ArrayList<>()), is(true)); - assertThat(isIndexable("my.douc.xls", Collections.singletonList("my.d?c*.xls"), new ArrayList<>()), is(false)); - 
assertThat(isIndexable("doc.doc", Arrays.asList("*.pdf", "*.xls", "*.doc"), new ArrayList<>()), is(true)); - assertThat(isIndexable("doc.ppt", Arrays.asList("*.pdf", "*.xls", "*.doc"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/test.doc", Collections.singletonList("*/*.doc"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/test.xls", Collections.singletonList("*/*.doc"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/*.doc"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/my.d?c*.xls"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/my.douc.xls", Collections.singletonList("*/my.d?c*.xls"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/doc.doc", Arrays.asList("*/*.pdf", "*/*.xls", "*/*.doc"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/doc.ppt", Arrays.asList("*/*.pdf", "*/*.xls", "*/*.doc"), new ArrayList<>()), is(false)); } @Test public void include_exclude() { - assertThat(isIndexable("test.doc", Collections.singletonList("*.xls"), Collections.singletonList("*.doc")), is(false)); - assertThat(isIndexable("test.xls", Collections.singletonList("*.xls"), Collections.singletonList("*.doc")), is(true)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("*.xls"), Collections.singletonList("*.doc")), is(true)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("*.xls"), Collections.singletonList("my.d?c*.xls")), is(false)); - assertThat(isIndexable("my.douc.xls", Collections.singletonList("*.xls"), Collections.singletonList("my.d?c*.xls")), is(true)); + assertThat(isIndexable(false, "/test.doc", Collections.singletonList("*/*.xls"), Collections.singletonList("*/*.doc")), is(false)); + assertThat(isIndexable(false, "/test.xls", Collections.singletonList("*/*.xls"), Collections.singletonList("*/*.doc")), is(true)); + 
assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/*.xls"), Collections.singletonList("*/*.doc")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/*.xls"), Collections.singletonList("*/my.d?c*.xls")), is(false)); + assertThat(isIndexable(false, "/my.douc.xls", Collections.singletonList("*/*.xls"), Collections.singletonList("*/my.d?c*.xls")), is(true)); } @Test public void default_ignored_file() { - assertThat(isIndexable("~mydoc", new ArrayList<>(), DEFAULT_EXCLUDED), is(false)); - assertThat(isIndexable("~", new ArrayList<>(), DEFAULT_EXCLUDED), is(false)); - assertThat(isIndexable("adoc.doc", new ArrayList<>(), DEFAULT_EXCLUDED), is(true)); - assertThat(isIndexable("mydoc~", new ArrayList<>(), DEFAULT_EXCLUDED), is(true)); + assertThat(isIndexable(false, "/~mydoc", new ArrayList<>(), DEFAULT_EXCLUDED), is(false)); + assertThat(isIndexable(false, "/~", new ArrayList<>(), DEFAULT_EXCLUDED), is(false)); + assertThat(isIndexable(false, "/adoc.doc", new ArrayList<>(), DEFAULT_EXCLUDED), is(true)); + assertThat(isIndexable(false, "/mydoc~", new ArrayList<>(), DEFAULT_EXCLUDED), is(true)); } @Test public void case_sensitive() { // Excludes - assertThat(isIndexable("test.doc", new ArrayList<>(), Collections.singletonList("*.DOC")), is(false)); - assertThat(isIndexable("test.xls", new ArrayList<>(), Collections.singletonList("*.DOC")), is(true)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Collections.singletonList("*.DOC")), is(true)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Arrays.asList("*.DOC", "*.XLS")), is(false)); - assertThat(isIndexable("my.doc.xls", new ArrayList<>(), Collections.singletonList("MY.D?C*.XLS")), is(false)); - assertThat(isIndexable("my.douc.xls", new ArrayList<>(), Collections.singletonList("MY.d?C*.XLS")), is(true)); - assertThat(isIndexable(".snapshots", new ArrayList<>(), Collections.singletonList(".SNAPSHOTS")), is(false)); + 
assertThat(isIndexable(false, "/test.doc", new ArrayList<>(), Collections.singletonList("*/*.DOC")), is(false)); + assertThat(isIndexable(false, "/test.xls", new ArrayList<>(), Collections.singletonList("*/*.DOC")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Collections.singletonList("*/*.DOC")), is(true)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Arrays.asList("*/*.DOC", "*/*.XLS")), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", new ArrayList<>(), Collections.singletonList("*/MY.D?C*.XLS")), is(false)); + assertThat(isIndexable(false, "/my.douc.xls", new ArrayList<>(), Collections.singletonList("*/MY.d?C*.XLS")), is(true)); + assertThat(isIndexable(false, "/.snapshots", new ArrayList<>(), Collections.singletonList("*/.SNAPSHOTS")), is(false)); // Includes - assertThat(isIndexable("test.doc", Collections.singletonList("*.DOC"), new ArrayList<>()), is(true)); - assertThat(isIndexable("test.xls", Collections.singletonList("*.DOC"), new ArrayList<>()), is(false)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("*.DOC"), new ArrayList<>()), is(false)); - assertThat(isIndexable("my.doc.xls", Collections.singletonList("MY.D?C*.XLS"), new ArrayList<>()), is(true)); - assertThat(isIndexable("my.douc.xls", Collections.singletonList("MY.D?C*.XLS"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/test.doc", Collections.singletonList("*/*.DOC"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/test.xls", Collections.singletonList("*/*.DOC"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/*.DOC"), new ArrayList<>()), is(false)); + assertThat(isIndexable(false, "/my.doc.xls", Collections.singletonList("*/MY.D?C*.XLS"), new ArrayList<>()), is(true)); + assertThat(isIndexable(false, "/my.douc.xls", Collections.singletonList("*/MY.D?C*.XLS"), new ArrayList<>()), is(false)); + } + + @Test + public 
void directories() { + assertThat(isIndexable(true, "/folderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderB/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(false)); + assertThat(isIndexable(true, "/folderB/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(false)); + assertThat(isIndexable(true, "/folderB/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(false)); + assertThat(isIndexable(true, "/folderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + assertThat(isIndexable(true, "/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolder*")), is(true)); + 
assertThat(isIndexable(true, "/folderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderA/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderB/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderB/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(false)); + assertThat(isIndexable(true, "/folderB/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/folderC/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/subfolderA", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/subfolderB", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); + assertThat(isIndexable(true, "/subfolderC", new ArrayList<>(), Collections.singletonList("/folderB/subfolderB")), is(true)); } } diff --git 
a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java index 3f30d2f0b..dc631979b 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java @@ -126,7 +126,7 @@ public void testWithSimplestJsonJobFile() throws IOException { assertThat(settings.getFs(), notNullValue()); assertThat(settings.getFs().getChecksum(), nullValue()); assertThat(settings.getFs().getIncludes(), nullValue()); - assertThat(settings.getFs().getExcludes(), contains("~*")); + assertThat(settings.getFs().getExcludes(), contains("*/~*")); assertThat(settings.getFs().getIndexedChars(), nullValue()); assertThat(settings.getFs().getUpdateRate(), is(TimeValue.timeValueMinutes(15))); assertThat(settings.getFs().getUrl(), is("/tmp/es")); From 812fe7dca312a911dccb31320dcac1d50dc44783 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Sat, 28 Jul 2018 10:17:48 +0200 Subject: [PATCH 2/2] Update to elasticsearch 6.3.2 Related to #569. --- docs/source/fscrawler.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/fscrawler.ini b/docs/source/fscrawler.ini index 3da937ad2..2ec1e481c 100644 --- a/docs/source/fscrawler.ini +++ b/docs/source/fscrawler.ini @@ -3,7 +3,7 @@ Version=2.5-SNAPSHOT [3rdParty] TikaVersion=1.18 -ElasticsearchVersion=6.3.1 +ElasticsearchVersion=6.3.2 LevigoVersion=2.0 TiffVersion=1.3.1 JpegVersion=1.3.0