Skip to content

Commit

Permalink
Merge pull request #1122 from dadoonet/pr/docker
Browse files Browse the repository at this point in the history
Generate FSCrawler docker images
  • Loading branch information
dadoonet authored Apr 23, 2021
2 parents 0927f27 + 691ddf1 commit d11c487
Show file tree
Hide file tree
Showing 12 changed files with 354 additions and 0 deletions.
8 changes: 8 additions & 0 deletions distribution/es6/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
<artifactId>fscrawler-es6</artifactId>
<name>FSCrawler ZIP Distribution for Elasticsearch 6.x</name>

<!-- Module-specific properties. module.name is interpolated into the Docker image names defined in distribution/pom.xml. -->
<properties>
<module.name>es6</module.name>
</properties>

<dependencies>
<dependency>
<groupId>fr.pilato.elasticsearch.crawler</groupId>
Expand All @@ -25,6 +29,10 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
</plugin>
<!-- Enables the fabric8 Docker plugin for this module; no configuration is given here. -->
<!-- NOTE(review): presumably the configuration comes from the pluginManagement section of distribution/pom.xml; confirm. -->
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
</plugin>
</plugins>
</build>

Expand Down
11 changes: 11 additions & 0 deletions distribution/es7/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@
<artifactId>fscrawler-es7</artifactId>
<name>FSCrawler ZIP Distribution for Elasticsearch 7.x</name>

<properties>
<!-- Interpolated into the Docker image names defined in distribution/pom.xml. -->
<module.name>es7</module.name>

<!-- Extra tags for the image configured through the "docker.eng" property prefix: "latest" and the project version. -->
<docker.eng.tags.0>latest</docker.eng.tags.0>
<docker.eng.tags.1>${project.version}</docker.eng.tags.1>
</properties>

<dependencies>
<dependency>
<groupId>fr.pilato.elasticsearch.crawler</groupId>
Expand All @@ -25,6 +32,10 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
</plugin>
<!-- Enables the fabric8 Docker plugin for this module; no configuration is given here. -->
<!-- NOTE(review): presumably the configuration comes from the pluginManagement section of distribution/pom.xml; confirm. -->
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
</plugin>
</plugins>
</build>

Expand Down
115 changes: 115 additions & 0 deletions distribution/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,55 @@
<module>es6</module>
</modules>

<properties>
<!-- Version patterns appended to the tesseract package names in the *.args.langsPkg properties below. -->
<tesseract.version>4.0.0*</tesseract.version>
<tesseract.lang.version>1:4.00*</tesseract.lang.version>

<!-- Global configuration parameters for the Docker plugin -->
<!-- The intent is to build the Docker images for every module by default; the value is taken from the DOCKER_SKIP environment variable. -->
<!-- NOTE(review): when DOCKER_SKIP is unset the expression stays unresolved; confirm the plugin then treats it as "do not skip". -->
<docker.skip>${env.DOCKER_SKIP}</docker.skip>
<docker.verbose>build</docker.verbose>
<docker.username>dadoonet</docker.username>
<!-- Default value only: each submodule is expected to override module.name with its own name. -->
<module.name>es7</module.name>

<!-- Docker image definitions. Each group of properties shares a prefix that is referenced from the docker-maven-plugin configuration. -->
<!-- "nolang" image: no langsPkg build argument is set, so neither tesseract-ocr nor any language data files are installed. -->
<docker.nolang.alias>${project.artifactId}-nolang</docker.nolang.alias>
<docker.nolang.name>${docker.username}/fscrawler:${project.version}-${module.name}-nolang</docker.nolang.name>
<docker.nolang.cacheFrom>${docker.nolang.name}</docker.nolang.cacheFrom>
<docker.nolang.dockerFile>${project.basedir}/../src/main/docker/Dockerfile</docker.nolang.dockerFile>
<docker.nolang.assembly.descriptor>${project.basedir}/../src/main/assembly/assembly.xml</docker.nolang.assembly.descriptor>
<docker.nolang.assembly.mode>tgz</docker.nolang.assembly.mode>

<!-- "eng" image: installs tesseract-ocr and the English data files. Dockerfile and assembly settings are reused from the "nolang" image. -->
<docker.eng.alias>${project.artifactId}-eng</docker.eng.alias>
<docker.eng.name>${docker.username}/fscrawler:${project.version}-${module.name}-eng</docker.eng.name>
<docker.eng.cacheFrom>${docker.eng.name}</docker.eng.cacheFrom>
<docker.eng.dockerFile>${docker.nolang.dockerFile}</docker.eng.dockerFile>
<docker.eng.assembly.descriptor>${docker.nolang.assembly.descriptor}</docker.eng.assembly.descriptor>
<docker.eng.assembly.mode>${docker.nolang.assembly.mode}</docker.eng.assembly.mode>
<docker.eng.args.langsPkg>tesseract-ocr=${tesseract.version} tesseract-ocr-eng=${tesseract.lang.version}</docker.eng.args.langsPkg>

<!-- "fra" image: installs tesseract-ocr and the French data files. Dockerfile and assembly settings are reused from the "nolang" image. -->
<docker.fra.alias>${project.artifactId}-fra</docker.fra.alias>
<docker.fra.name>${docker.username}/fscrawler:${project.version}-${module.name}-fra</docker.fra.name>
<docker.fra.cacheFrom>${docker.fra.name}</docker.fra.cacheFrom>
<docker.fra.dockerFile>${docker.nolang.dockerFile}</docker.fra.dockerFile>
<docker.fra.assembly.descriptor>${docker.nolang.assembly.descriptor}</docker.fra.assembly.descriptor>
<docker.fra.assembly.mode>${docker.nolang.assembly.mode}</docker.fra.assembly.mode>
<docker.fra.args.langsPkg>tesseract-ocr=${tesseract.version} tesseract-ocr-fra=${tesseract.lang.version}</docker.fra.args.langsPkg>

<!-- "jpn" image: installs tesseract-ocr and the Japanese data files. Dockerfile and assembly settings are reused from the "nolang" image. -->
<docker.jpn.alias>${project.artifactId}-jpn</docker.jpn.alias>
<docker.jpn.name>${docker.username}/fscrawler:${project.version}-${module.name}-jpn</docker.jpn.name>
<docker.jpn.cacheFrom>${docker.jpn.name}</docker.jpn.cacheFrom>
<docker.jpn.dockerFile>${docker.nolang.dockerFile}</docker.jpn.dockerFile>
<docker.jpn.assembly.descriptor>${docker.nolang.assembly.descriptor}</docker.jpn.assembly.descriptor>
<docker.jpn.assembly.mode>${docker.nolang.assembly.mode}</docker.jpn.assembly.mode>
<docker.jpn.args.langsPkg>tesseract-ocr=${tesseract.version} tesseract-ocr-jpn=${tesseract.lang.version}</docker.jpn.args.langsPkg>
</properties>

<dependencies>
<dependency>
<groupId>fr.pilato.elasticsearch.crawler</groupId>
Expand Down Expand Up @@ -109,6 +158,72 @@
</execution>
</executions>
</plugin>
<!-- Generate the Docker images: they are built during the package phase and pushed during the deploy phase. -->
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
<configuration>
<!-- Registry credentials used when pushing, taken from the docker.push.username and docker.push.password properties. -->
<authConfig>
<push>
<username>${docker.push.username}</username>
<password>${docker.push.password}</password>
</push>
</authConfig>
<!-- combine.self="override" replaces any inherited images configuration instead of merging with it. -->
<!-- NOTE(review): the original comment says this disables the configuration used for integration tests; confirm against the parent POM. -->
<images combine.self="override">
<!-- Each image below is configured from the POM properties that start with the given prefix. -->
<image>
<external>
<type>properties</type>
<prefix>docker.nolang</prefix>
</external>
</image>
<image>
<external>
<type>properties</type>
<prefix>docker.eng</prefix>
</external>
</image>
<image>
<external>
<type>properties</type>
<prefix>docker.fra</prefix>
</external>
</image>
<image>
<external>
<type>properties</type>
<prefix>docker.jpn</prefix>
</external>
</image>
</images>
</configuration>
<executions>
<!-- Build the images as part of the package phase. -->
<execution>
<id>docker-build</id>
<phase>package</phase>
<goals>
<goal>build</goal>
</goals>
</execution>
<!-- Push the images as part of the deploy phase. -->
<execution>
<id>docker-push</id>
<phase>deploy</phase>
<goals>
<goal>push</goal>
</goals>
</execution>
<execution>
<!-- There are no integration tests here, so this execution is unbound by assigning it to the phase "none". -->
<id>start-elasticsearch</id>
<phase>none</phase>
</execution>
<execution>
<!-- There are no integration tests here, so this execution is unbound by assigning it to the phase "none". -->
<id>stop-elasticsearch</id>
<phase>none</phase>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
</build>
Expand Down
17 changes: 17 additions & 0 deletions distribution/src/main/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# FSCrawler runtime image, based on the slim Debian buster variant of OpenJDK 15.
FROM openjdk:15.0.2-jdk-slim-buster

# Space-separated list of extra apt packages to install (tesseract-ocr and its
# language data). When the build argument is not provided, nothing extra is installed.
ARG langsPkg

# Install gettext-base plus the optional packages, then remove the apt caches
# in the same layer. "set -e" aborts the build on the first failing command.
RUN set -ex; \
    apt-get update; \
    apt-get install -y --no-install-recommends "gettext-base=0.19.*" ${langsPkg}; \
    apt-get clean; \
    rm -rf /var/lib/apt/lists/*

# NOTE(review): "maven" is presumably the directory where the docker-maven-plugin
# places the assembled distribution in the build context; confirm.
COPY maven /usr/share/fscrawler

# Expose the launcher script on the PATH.
RUN set -ex; \
    ln -sn /usr/share/fscrawler/bin/fscrawler /usr/bin/

WORKDIR /usr/share/fscrawler
21 changes: 21 additions & 0 deletions docs/source/dev/build.rst
Original file line number Diff line number Diff line change
Expand Up @@ -229,3 +229,24 @@ If you want to skip the check, you can run with ``-Dossindex.fail=false``::

mvn clean install -Dossindex.fail=false

DockerHub publication
^^^^^^^^^^^^^^^^^^^^^

To publish the latest build to `DockerHub <https://hub.docker.com/r/dadoonet/fscrawler/>`_, you can manually
call the ``docker:push`` Maven goal and provide the credentials ``docker.push.username`` and ``docker.push.password``::

    mvn -f distribution/pom.xml docker:push \
        -Ddocker.push.username=yourdockerhubaccount \
        -Ddocker.push.password=yourverysecuredpassword

Otherwise, if you run the Maven ``deploy`` phase, the images are pushed automatically.
Note that you still need to provide the credentials ``docker.push.username`` and ``docker.push.password``::

    mvn deploy \
        -Ddocker.push.username=yourdockerhubaccount \
        -Ddocker.push.password=yourverysecuredpassword

You can also provide the settings as environment variables:

* ``env.DOCKER_USERNAME`` or ``DOCKER_USERNAME``
* ``env.DOCKER_PASSWORD`` or ``DOCKER_PASSWORD``
3 changes: 3 additions & 0 deletions docs/source/dev/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@ the tests while building the release.

Only developers with write rights to the sonatype repository under ``fr.pilato`` space
can perform the release.

Only developers with write rights to the `DockerHub repository <https://hub.docker.com/r/dadoonet/fscrawler/>`_
can push the Docker images.
144 changes: 144 additions & 0 deletions docs/source/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,150 @@ The distribution contains:
├── ... All needed jars


Using docker
------------

Pull the Docker image:

.. code:: sh

   docker pull dadoonet/fscrawler

Let's say your documents are located in the ``~/tmp`` directory and you want to store your FSCrawler jobs in ``~/.fscrawler``.
You can run FSCrawler with:

.. code:: sh

   docker run -it --rm -v ~/.fscrawler:/root/.fscrawler -v ~/tmp:/tmp/es:ro dadoonet/fscrawler fscrawler job_name

On the first run, if the job does not exist yet in ``~/.fscrawler``, FSCrawler will ask you if you want to create it:

::

    10:16:53,880 INFO [f.p.e.c.f.c.BootstrapChecks] Memory [Free/Total=Percent]: HEAP [67.3mb/876.5mb=7.69%], RAM [2.1gb/3.8gb=55.43%], Swap [1023.9mb/1023.9mb=100.0%].
    10:16:53,899 WARN [f.p.e.c.f.c.FsCrawlerCli] job [job_name] does not exist
    10:16:53,900 INFO [f.p.e.c.f.c.FsCrawlerCli] Do you want to create it (Y/N)?
    y
    10:16:56,745 INFO [f.p.e.c.f.c.FsCrawlerCli] Settings have been created in [/root/.fscrawler/job_name/_settings.yaml]. Please review and edit before relaunch

.. note::

    The configuration file is actually stored on your machine in ``~/.fscrawler/job_name/_settings.yaml``.
    Remember to change the URL of your elasticsearch instance as the container won't be able to see it
    running under the default ``127.0.0.1``. You will need to use the actual IP address of the host.


Using docker compose
--------------------

In this section, the following directory layout is assumed:

.. code-block:: none

   .
   ├── config
   │   └── job_name
   │       └── _settings.yaml
   ├── data
   │   └── <your files>
   └── docker-compose.yml

For example, to connect to a docker container named ``elasticsearch``, modify your ``_settings.yaml``.

.. code:: yaml

   name: "job_name"
   elasticsearch:
     nodes:
     - url: "http://elasticsearch:9200"

And, prepare the following ``docker-compose.yml``.

.. code:: yaml

   version: '2.2'
   services:
     # FSCrawler
     fscrawler:
       image: dadoonet/fscrawler
       container_name: fscrawler
       volumes:
         - ${PWD}/config:/root/.fscrawler
         - ${PWD}/data:/tmp/es
       networks:
         - esnet
       command: fscrawler job_name
     # Elasticsearch Cluster
     elasticsearch:
       image: docker.elastic.co/elasticsearch/elasticsearch:7.3.2
       container_name: elasticsearch
       environment:
         - node.name=elasticsearch
         - discovery.seed_hosts=elasticsearch2
         - cluster.initial_master_nodes=elasticsearch,elasticsearch2
         - cluster.name=docker-cluster
         - bootstrap.memory_lock=true
         - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
       ulimits:
         memlock:
           soft: -1
           hard: -1
       volumes:
         - esdata01:/usr/share/elasticsearch/data
       ports:
         - 9200:9200
       networks:
         - esnet
     elasticsearch2:
       image: docker.elastic.co/elasticsearch/elasticsearch:7.3.2
       container_name: elasticsearch2
       environment:
         - node.name=elasticsearch2
         - discovery.seed_hosts=elasticsearch
         - cluster.initial_master_nodes=elasticsearch,elasticsearch2
         - cluster.name=docker-cluster
         - bootstrap.memory_lock=true
         - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
       ulimits:
         memlock:
           soft: -1
           hard: -1
       volumes:
         - esdata02:/usr/share/elasticsearch/data
       networks:
         - esnet
   volumes:
     esdata01:
       driver: local
     esdata02:
       driver: local
   networks:
     esnet:

Then, you can run Elasticsearch.

.. code:: sh

   docker-compose up -d elasticsearch
   docker-compose logs -f elasticsearch

Wait for elasticsearch to be started:

::



After starting Elasticsearch, you can run FSCrawler.

.. code:: sh

   docker-compose up fscrawler

Running as a Service on Windows
-------------------------------

Expand Down
1 change: 1 addition & 0 deletions docs/source/user/getting_started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,4 @@ If you would like to ignore some folders to be scanned, just add a ``.fscrawleri
The folder content and all sub folders will be ignored.

For more information, read :ref:`includes_excludes`.

13 changes: 13 additions & 0 deletions docs/source/user/tips.rst
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,16 @@ Then expose the docker container you've created by changing the IP of the REST U
url: "http://fscrawler:8080"
Pull the Docker image:

.. code:: sh

   docker pull dadoonet/fscrawler

Run it:

.. code:: sh

   docker run dadoonet/fscrawler fscrawler job
Loading

0 comments on commit d11c487

Please sign in to comment.