
Fix the implementation
This is now excluding the directories we don't want to include.
dadoonet committed Jul 23, 2018
1 parent 4d5b7d3 commit 0a86ec2
Showing 8 changed files with 146 additions and 67 deletions.
@@ -239,13 +239,15 @@ private void addFilesRecursively(FileAbstractor<?> path, String filepath, LocalD
         for (FileAbstractModel child : children) {
             String filename = child.name;
 
+            String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, filename).toString());
+
             // https://github.com/dadoonet/fscrawler/issues/1 : Filter documents
-            boolean isIndexable = isIndexable(child.directory, filename, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes());
+            boolean isIndexable = isIndexable(child.directory, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes());
 
-            logger.debug("[{}] can be indexed: [{}]", filename, isIndexable);
+            logger.debug("[{}] can be indexed: [{}]", virtualFileName, isIndexable);
             if (isIndexable) {
                 if (child.file) {
-                    logger.debug(" - file: {}", filename);
+                    logger.debug(" - file: {}", virtualFileName);
                     fsFiles.add(filename);
                     if (child.lastModifiedDate.isAfter(lastScanDate) ||
                             (child.creationDate != null && child.creationDate.isAfter(lastScanDate))) {
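
The change above is easier to reason about in isolation. The sketch below is illustrative only: the helper names mirror those in the patch, but their bodies and the glob-to-regex translation are assumptions, not FSCrawler's actual implementation. It shows why matching against the virtual path (the path relative to the crawl root) rather than the bare filename lets directory-level patterns take effect.

import java.io.File;
import java.util.List;
import java.util.regex.Pattern;

public class VirtualPathMatchSketch {

    // Build the path relative to the crawl root, e.g. "/folderB/subfolderA/report.doc"
    static String computeVirtualPathName(String rootPath, String realPath) {
        String virtual = realPath.substring(rootPath.length());
        return virtual.isEmpty() ? "/" : virtual.replace('\\', '/');
    }

    // Very rough glob support for the sketch: '*' matches any run of characters
    static boolean matches(String pattern, String virtualPath) {
        String regex = Pattern.quote(pattern).replace("*", "\\E.*\\Q");
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(virtualPath).matches();
    }

    static boolean isIndexable(String virtualPath, List<String> includes, List<String> excludes) {
        for (String exclude : excludes) {
            if (matches(exclude, virtualPath)) return false;   // an exclude match wins
        }
        if (includes.isEmpty()) return true;                   // no includes means everything else passes
        for (String include : includes) {
            if (matches(include, virtualPath)) return true;
        }
        return false;
    }

    public static void main(String[] args) {
        String root = "/tmp";
        String file = new File("/tmp/folderB/subfolderA", "report.doc").toString();
        String virtual = computeVirtualPathName(root, file);   // "/folderB/subfolderA/report.doc"
        // Matching on the filename alone could never see "folderB"; the virtual path can.
        System.out.println(isIndexable(virtual, List.of("*/*.doc"), List.of("/folderB/subfolder*")));
        // prints: false (the directory-level exclude now applies)
    }
}

Before this commit, the same include/exclude patterns were matched against the filename alone, so a pattern such as /folderB/subfolder* could never match anything.
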
@@ -293,7 +295,8 @@ private void addFilesRecursively(FileAbstractor<?> path, String filepath, LocalD
             for (String esfile : esFiles) {
                 logger.trace("Checking file [{}]", esfile);
 
-                if (isIndexable(false, esfile, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())
+                String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, esfile).toString());
+                if (isIndexable(false, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())
                         && !fsFiles.contains(esfile)) {
                     logger.trace("Removing file [{}] in elasticsearch", esfile);
                     esDelete(fsSettings.getElasticsearch().getIndex(), generateIdFromFilename(esfile, filepath));
@@ -307,7 +310,8 @@ private void addFilesRecursively(FileAbstractor<?> path, String filepath, LocalD
 
             // for the delete folder
             for (String esfolder : esFolders) {
-                if (isIndexable(true, esfolder, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())) {
+                String virtualFileName = computeVirtualPathName(stats.getRootPath(), new File(filepath, esfolder).toString());
+                if (isIndexable(true, virtualFileName, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes())) {
                     logger.trace("Checking directory [{}]", esfolder);
                     if (!fsFolders.contains(esfolder)) {
                         logger.trace("Removing recursively directory [{}] in elasticsearch", esfolder);
52 changes: 46 additions & 6 deletions docs/source/admin/fs/local-fs.rst
@@ -102,6 +102,8 @@ file system and another run. Which means that if you set it to ``15m``,
 the next scan will happen on 15 minutes after the end of the current
 scan, whatever its duration.
 
+.. _includes_excludes:
+
 Includes and excludes
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -117,21 +119,59 @@ Define ``fs.includes`` and ``fs.excludes`` properties in your
       "name" : "test",
       "fs": {
         "includes": [
-          "*.doc",
-          "*.pdf"
+          "*/*.doc",
+          "*/*.pdf"
         ],
         "excludes": [
-          "resume*"
+          "*/resume*"
         ]
       }
     }
 
-It also applies to directory names. So if you want to ignore ``.ignore``
-dir, just add ``.ignore`` as an excluded name. Note that ``includes``
-does not apply to directory names but only to filenames.
-
 By default, FSCrawler will exclude files starting with ``~``.
 
+.. versionadded:: 2.5
+
+It also applies to directory names. So if you want to ignore ``.ignore``
+dir, just add ``.ignore`` as an excluded name. Note that ``includes`` and ``excludes``
+apply to directory names as well.
+
+Let's take the following example with the ``root`` dir as ``/tmp``:
+
+.. code::
+
+    /tmp
+    ├── folderA
+    │   ├── subfolderA
+    │   ├── subfolderB
+    │   └── subfolderC
+    ├── folderB
+    │   ├── subfolderA
+    │   ├── subfolderB
+    │   └── subfolderC
+    └── folderC
+        ├── subfolderA
+        ├── subfolderB
+        └── subfolderC
+
+If you define the following ``fs.excludes`` property in your
+``~/.fscrawler/test/_settings.json`` file:
+
+.. code:: json
+
+    {
+      "name" : "test",
+      "fs": {
+        "excludes": [
+          "/folderB/subfolder*"
+        ]
+      }
+    }
+
+Then all files but the ones in ``/folderB/subfolderA``, ``/folderB/subfolderB`` and
+``/folderB/subfolderC`` will be indexed.
+
 Indexing JSon docs
 ^^^^^^^^^^^^^^^^^^

133 changes: 82 additions & 51 deletions docs/source/installation.rst
@@ -200,55 +200,86 @@ Upgrade to 2.4
Upgrade to 2.5
~~~~~~~~~~~~~~

- A bug was causing a lot of data going over the wire each time
  FSCrawler was running. To fix this issue, we changed the default
  mapping and we set ``store: true`` on field ``file.filename``. If
  this field is not stored and ``remove_deleted`` is ``true``
  (default), FSCrawler will fail while crawling your documents. You
  need to create the new mapping accordingly and reindex your existing
  data either by deleting the old index and running again FSCrawler or
  by using the `reindex API <https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html>`__
  as follows:

  ::

     # Backup old index data
     POST _reindex
     {
       "source": {
         "index": "job_name"
       },
       "dest": {
         "index": "job_name_backup"
       }
     }
     # Remove job_name index
     DELETE job_name

  Restart FSCrawler with the following command. It will just create the
  right mapping again.

  .. code:: sh

     $ bin/fscrawler job_name --loop 0

  Then restore old data:

  ::

     POST _reindex
     {
       "source": {
         "index": "job_name_backup"
       },
       "dest": {
         "index": "job_name"
       }
     }
     # Remove backup index
     DELETE job_name_backup

  The default mapping changed for FSCrawler for ``meta.raw.*`` fields.
  Might be better to reindex your data.

- The ``excludes`` parameter is also used for directory names. But this
  new implementation also brings a breaking change if you were using ``excludes``
  previously. In the previous implementation, the regular expression was only applied
  to the filename. It's now applied to the full virtual path name.

  For example if you have a ``/tmp`` dir as follows:

  .. code::

     /tmp
     └── folder
         ├── foo.txt
         └── bar.txt

  Previously excluding ``foo.txt`` was excluding the virtual file ``/folder/foo.txt``.
  If you still want to exclude any file named ``foo.txt`` whatever its directory
  you now need to specify ``*/foo.txt``:

  .. code:: json

     {
       "name" : "test",
       "fs": {
         "excludes": [
           "*/foo.txt"
         ]
       }
     }

  For more information, read :ref:`includes_excludes`.
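
To make the breaking change concrete, here is a minimal, self-contained check. The glob-to-regex translation below is an assumption for illustration and not FSCrawler's actual matcher, but the point about matching against the full virtual path holds.

import java.util.regex.Pattern;

public class ExcludePatternChange {

    // Assumed '*' glob semantics translated to a full-match regex (illustrative only)
    static boolean globMatches(String pattern, String path) {
        String regex = Pattern.quote(pattern).replace("*", "\\E.*\\Q");
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(path).matches();
    }

    public static void main(String[] args) {
        String virtualPath = "/folder/foo.txt";                     // matching now happens against this full virtual path
        System.out.println(globMatches("foo.txt", virtualPath));    // false: a bare filename pattern no longer matches
        System.out.println(globMatches("*/foo.txt", virtualPath));  // true: prefix the old pattern with */
    }
}
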

@@ -30,7 +30,7 @@ public class FsCrawlerTestIncludesIT extends AbstractFsCrawlerITCase {
     @Test
     public void test_includes() throws Exception {
         Fs fs = startCrawlerDefinition()
-                .addInclude("*_include.txt")
+                .addInclude("*/*_include\\.txt")
                 .build();
         startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
         countTestHelper(new SearchRequest(getCrawlerName()), 1L, null);
@@ -39,7 +39,7 @@ public void test_includes() throws Exception {
     @Test
     public void test_subdirs_with_patterns() throws Exception {
         Fs fs = startCrawlerDefinition()
-                .addInclude("*.txt")
+                .addInclude("*/*\\.txt")
                 .build();
         startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
 
@@ -50,11 +50,12 @@ public void test_subdirs_with_patterns() throws Exception {
     @Test
     public void test_ignore_dir() throws Exception {
         Fs fs = startCrawlerDefinition()
-                .addExclude(".ignore")
+                .addExclude("*/\\.ignore")
+                .addExclude("/subdir/sub*")
                 .build();
         startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);
 
         // We expect to have one file
-        countTestHelper(new SearchRequest(getCrawlerName()), 1L, null);
+        countTestHelper(new SearchRequest(getCrawlerName()), 2L, null);
     }
 }
@@ -0,0 +1 @@
+This file contains some words.

@@ -0,0 +1 @@
+This file contains some words.

@@ -0,0 +1 @@
+This file contains some words.
@@ -55,7 +55,7 @@ public static Builder builder() {
     }
 
     public static final String DEFAULT_DIR = "/tmp/es";
-    public static final List<String> DEFAULT_EXCLUDED = Collections.singletonList("~*");
+    public static final List<String> DEFAULT_EXCLUDED = Collections.singletonList("*/~*");
     public static final Fs DEFAULT = Fs.builder().setUrl(DEFAULT_DIR).setExcludes(DEFAULT_EXCLUDED).build();
 
     public static class Builder {
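
The default exclude moves from "~*" to "*/~*" for the same reason: the pattern is now evaluated against the full virtual path, so it needs a leading "*/" to keep skipping tilde-prefixed temporary files in any directory. A minimal check under the same assumed glob semantics as the sketches above (the sample path is hypothetical):

import java.util.regex.Pattern;

public class DefaultExcludeCheck {

    // Same assumed glob-to-regex translation as in the earlier sketches
    static boolean globMatches(String pattern, String path) {
        String regex = Pattern.quote(pattern).replace("*", "\\E.*\\Q");
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(path).matches();
    }

    public static void main(String[] args) {
        String tempFile = "/folderA/~$report.docx";               // hypothetical Office temp file
        System.out.println(globMatches("~*", tempFile));          // false against a full virtual path
        System.out.println(globMatches("*/~*", tempFile));        // true: the new default still skips it
    }
}
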
