From 2b4e454a112b9a86f69dcfe2232fe6e9682066d1 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Feb 2022 10:37:26 +0100 Subject: [PATCH 01/78] First draft of the entrypoint --- src/bluesearch/entrypoint/database/parent.py | 6 + src/bluesearch/entrypoint/database/run.py | 109 +++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 src/bluesearch/entrypoint/database/run.py diff --git a/src/bluesearch/entrypoint/database/parent.py b/src/bluesearch/entrypoint/database/parent.py index 8d392b134..fd2a01c62 100644 --- a/src/bluesearch/entrypoint/database/parent.py +++ b/src/bluesearch/entrypoint/database/parent.py @@ -13,6 +13,7 @@ download, init, parse, + run, topic_extract, topic_filter, ) @@ -71,6 +72,11 @@ def main(argv: Sequence[str] | None = None) -> int: init_parser=parse.init_parser, run=parse.run, ), + "run": Cmd( + help="Run the pipeline.", + init_parser=run.init_parser, + run=run.run, + ), "topic-extract": Cmd( help="Extract topic of article(s).", init_parser=topic_extract.init_parser, diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py new file mode 100644 index 000000000..0f01b3789 --- /dev/null +++ b/src/bluesearch/entrypoint/database/run.py @@ -0,0 +1,109 @@ +# Blue Brain Search is a text mining toolbox focused on scientific use cases. +# +# Copyright (C) 2020 Blue Brain Project, EPFL. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . +"""Run the overall pipeline.""" +from __future__ import annotations + +import argparse +import json +import logging +import warnings +from pathlib import Path +from typing import Iterator + +from defusedxml import ElementTree + +from bluesearch.database.article import ArticleSource + +logger = logging.getLogger(__name__) + +def convert_to_datetime(s: str) -> datetime: + """Try to convert a string to a datetime. + + Parameters + ---------- + s + String to be check as a valid date. + + Returns + ------- + datetime + The date specified in the input string. + + Raises + ------ + ArgumentTypeError + When the specified string has not a valid date format. + """ + try: + return datetime.strptime(s, "%Y-%m") + except ValueError: + msg = f"{s} is not a valid date" + raise argparse.ArgumentTypeError(msg) + + +def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + """Initialise the argument parser for the run subcommand. + + Parameters + ---------- + parser + The argument parser to initialise. + + Returns + ------- + argparse.ArgumentParser + The initialised argument parser. The same object as the `parser` + argument. + """ + parser.description = "Run the overall pipeline." + + parser.add_argument( + "source", + type=str, + choices=[member.value for member in ArticleSource], + help="Source of the articles.", + ) + parser.add_argument( + "from_month", + type=convert_to_datetime, + help="The starting month (included) for the download in format YYYY-MM. " + "All papers from the given month until today will be downloaded.", + ) + parser.add_argument( + "filter_config", + type=Path, + help=""" + Path to a .JSONL file that defines all the rules for filtering. + """, + ) + return parser + + +def run( + *, + source: str, + from_month: datetime, + filter_config: Path, +) -> int: + """Run overall pipeline. 
+ + Parameter description and potential defaults are documented inside of the + `get_parser` function. + """ + logger.info("Starting the overall pipeline") + + return 0 From 886c306a93f621d64470f218869c8e8be26fecec Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Feb 2022 10:44:26 +0100 Subject: [PATCH 02/78] Write initial test --- tests/unit/entrypoint/database/test_run.py | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/unit/entrypoint/database/test_run.py diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py new file mode 100644 index 000000000..06ba52a22 --- /dev/null +++ b/tests/unit/entrypoint/database/test_run.py @@ -0,0 +1,49 @@ +# Blue Brain Search is a text mining toolbox focused on scientific use cases. +# +# Copyright (C) 2020 Blue Brain Project, EPFL. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
+ +import argparse +import datetime +import inspect +import pathlib + +import numpy as np +import pandas as pd +import pytest + +from bluesearch.entrypoint.database import run +from bluesearch.utils import JSONL + +RUN_PARAMS = { + "source", + "from_month", + "filter_config", +} + +def test_init_parser(): + parser = run.init_parser(argparse.ArgumentParser()) + + args = parser.parse_args(["arxiv", "2021-12", "/path/to/config.jsonl"]) + assert vars(args).keys() == RUN_PARAMS + + # Test the values + assert args.source == "arxiv" + assert args.from_month == datetime.datetime(2021, 12, 1) + assert args.filter_config == pathlib.Path("/path/to/config.jsonl") + + +def test_run_arguments(): + assert inspect.signature(run.run).parameters.keys() == RUN_PARAMS From c9c2d4918c0c7a027a7b37cbdc3116f4c631bb94 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Feb 2022 14:24:56 +0100 Subject: [PATCH 03/78] First kind of working version/sketch --- src/bluesearch/entrypoint/database/run.py | 108 +++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 0f01b3789..4ff667bd5 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -21,10 +21,12 @@ import json import logging import warnings +from datetime import datetime from pathlib import Path from typing import Iterator -from defusedxml import ElementTree +import luigi +from luigi.util import inherits, requires from bluesearch.database.article import ArticleSource @@ -93,6 +95,102 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: return parser +FOLDER = Path.cwd() / "luigi" / "temp" +FOLDER.mkdir(exist_ok=True, parents=True) + +class DownloadTask(luigi.Task): + source = luigi.Parameter() + from_month = luigi.DateParameter() + def requires(self): + pass + + def run(self): + print(self.__class__.__name__) + output_file = 
Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "download_done.txt" + return luigi.LocalTarget(str(output_file)) + + + + +# @inherits(DownloadTask) +@requires(DownloadTask) +class TopicExtractTask(luigi.Task): + source = luigi.Parameter() + + def run(self): + print(self.__class__.__name__) + output_file = Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "extraction_done.txt" + + return luigi.LocalTarget(str(output_file)) + +# @inherits(TopicExtractTask) +@requires(TopicExtractTask) +class TopicFilterTask(luigi.Task): + filter_config = luigi.Parameter() + + def run(self): + print(self.__class__.__name__) + output_file = Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "filtering_done.txt" + + return luigi.LocalTarget(str(output_file)) + +@requires(TopicFilterTask) +class ConvertPDFTask(luigi.Task): + def run(self): + print(self.__class__.__name__) + output_file = Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "converting_pdf_done.txt" + + return luigi.LocalTarget(str(output_file)) + + +@inherits(ConvertPDFTask, TopicFilterTask) +# @requires(TopicFilterTask) +class ParseTask(luigi.Task): + def run(self): + print(self.__class__.__name__) + + output_file = Path(self.output().path) + output_file.touch() + + def requires(self): + if self.source == "arxiv": + return self.clone(ConvertPDFTask) + else: + return self.clone(TopicFilterTask) + + def output(self): + output_file = FOLDER / "parsing_done.txt" + + return luigi.LocalTarget(str(output_file)) + +@requires(ParseTask) +class AddTask(luigi.Task): + def run(self): + print(self.__class__.__name__) + output_file = Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "adding_done.txt" + + return luigi.LocalTarget(str(output_file)) + def run( *, source: str, @@ -106,4 +204,12 @@ def run( """ 
logger.info("Starting the overall pipeline") + + luigi.build( + [ + AddTask(source=source, from_month=from_month, filter_config=filter_config) + ], + log_level="CRITICAL" + ) + return 0 From 4e561b7d14aa22839842cb31353b3495451410d4 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Feb 2022 15:22:11 +0100 Subject: [PATCH 04/78] Make download task work --- src/bluesearch/entrypoint/database/run.py | 95 +++++++++++++---------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 4ff667bd5..7eee4ebc3 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -27,35 +27,12 @@ import luigi from luigi.util import inherits, requires +from luigi.contrib.external_program import ExternalProgramTask from bluesearch.database.article import ArticleSource logger = logging.getLogger(__name__) -def convert_to_datetime(s: str) -> datetime: - """Try to convert a string to a datetime. - - Parameters - ---------- - s - String to be check as a valid date. - - Returns - ------- - datetime - The date specified in the input string. - - Raises - ------ - ArgumentTypeError - When the specified string has not a valid date format. - """ - try: - return datetime.strptime(s, "%Y-%m") - except ValueError: - msg = f"{s} is not a valid date" - raise argparse.ArgumentTypeError(msg) - def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """Initialise the argument parser for the run subcommand. @@ -81,7 +58,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: ) parser.add_argument( "from_month", - type=convert_to_datetime, + type=str, help="The starting month (included) for the download in format YYYY-MM. 
" "All papers from the given month until today will be downloaded.", ) @@ -92,28 +69,45 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: Path to a .JSONL file that defines all the rules for filtering. """, ) + parser.add_argument( + "output_dir", + type=Path, + help=""" + Path to the output folder. All the results stored under + `output_dir/source/date` where date is concatenation of the + `from_month` and the day of execution of this command. + """, + ) return parser FOLDER = Path.cwd() / "luigi" / "temp" FOLDER.mkdir(exist_ok=True, parents=True) -class DownloadTask(luigi.Task): +BBS_BINARY = "bbs_database" + +class DownloadTask(ExternalProgramTask): source = luigi.Parameter() - from_month = luigi.DateParameter() - def requires(self): - pass + from_month = luigi.Parameter() + output_dir = luigi.Parameter() - def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() + capture_output=False def output(self): - output_file = FOLDER / "download_done.txt" - return luigi.LocalTarget(str(output_file)) + today = datetime.today() + date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + + output_dir = Path(self.output_dir) / self.source / date / "raw" + + return luigi.LocalTarget(str(output_dir)) + def program_args(self): + output_dir = self.output().path + return [ + BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, + ] + # @inherits(DownloadTask) @@ -127,7 +121,7 @@ def run(self): output_file.touch() def output(self): - output_file = FOLDER / "extraction_done.txt" + output_file = Path(self.input().path).parent / "extraction_done.txt" return luigi.LocalTarget(str(output_file)) @@ -142,7 +136,7 @@ def run(self): output_file.touch() def output(self): - output_file = FOLDER / "filtering_done.txt" + output_file = Path(self.input().path).parent / "filtering_done.txt" return luigi.LocalTarget(str(output_file)) @@ -154,7 +148,7 @@ def run(self): output_file.touch() def 
output(self): - output_file = FOLDER / "converting_pdf_done.txt" + output_file = Path(self.input().path).parent / "converting_pdf_done.txt" return luigi.LocalTarget(str(output_file)) @@ -175,7 +169,7 @@ def requires(self): return self.clone(TopicFilterTask) def output(self): - output_file = FOLDER / "parsing_done.txt" + output_file = Path(self.input().path).parent / "parsing_done.txt" return luigi.LocalTarget(str(output_file)) @@ -187,15 +181,23 @@ def run(self): output_file.touch() def output(self): - output_file = FOLDER / "adding_done.txt" + output_file = Path(self.input().path).parent / "adding_done.txt" return luigi.LocalTarget(str(output_file)) +@requires(AddTask) +class ListTask(ExternalProgramTask): + capture_output = False + def program_args(self): + return ["ls", "-alh", "luigi/temp/"] + + def run( *, source: str, - from_month: datetime, + from_month: str, filter_config: Path, + output_dir: Path, ) -> int: """Run overall pipeline. @@ -207,9 +209,16 @@ def run( luigi.build( [ - AddTask(source=source, from_month=from_month, filter_config=filter_config) + AddTask( + source=source, + from_month=from_month, + filter_config=str(filter_config), + output_dir=str(output_dir), + ) + # ListTask(source=source, from_month=from_month, filter_config=filter_config) ], - log_level="CRITICAL" + log_level="INFO", + # log_level="INFO" ) return 0 From 4ae5fa9b111e32570f2a08691cb21a586d6e08e9 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 9 Feb 2022 21:04:12 +0100 Subject: [PATCH 05/78] Implement unzipping logic --- src/bluesearch/entrypoint/database/run.py | 109 ++++++++++++++++++++-- 1 file changed, 102 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 7eee4ebc3..972a01f6f 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -18,8 +18,11 @@ from __future__ import annotations import argparse +import gzip import json import logging 
+import shutil +import tarfile import warnings from datetime import datetime from pathlib import Path @@ -78,20 +81,50 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: `from_month` and the day of execution of this command. """, ) - return parser + parser.add_argument( + "db_url", + type=str, + help=""" + The location of the database depending on the database type. + For MySQL and MariaDB the server URL should be provided, for SQLite the + location of the database file. Generally, the scheme part of + the URL should be omitted, e.g. for MySQL the URL should be + of the form 'my_sql_server.ch:1234/my_database' and for SQLite + of the form '/path/to/the/local/database.db'. + """, + ) + parser.add_argument( + "--db-type", + default="sqlite", + type=str, + choices=("mariadb", "mysql", "postgres", "sqlite"), + help="Type of the database.", + ) + parser.add_argument( + "--mesh-topic-db", + type=Path, + help=""" + The JSON file with MeSH topic hierarchy information. Mandatory for + source types "pmc" and "pubmed". + + The JSON file should contain a flat dictionary with MeSH topic tree + numbers mapped to the corresponding topic labels. This file can be + produced using the `bbs_database parse-mesh-rdf` command. See that + command's description for more details. 
+ """, + ) + return parser -FOLDER = Path.cwd() / "luigi" / "temp" -FOLDER.mkdir(exist_ok=True, parents=True) BBS_BINARY = "bbs_database" +CAPTURE_OUTPUT = False class DownloadTask(ExternalProgramTask): source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() - capture_output=False def output(self): today = datetime.today() @@ -108,10 +141,58 @@ def program_args(self): BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, ] +@requires(DownloadTask) +class UnzipTask(ExternalProgramTask): + """Needs to support unziping of both pubmed and pmc.""" + source = luigi.Parameter() -# @inherits(DownloadTask) -@requires(DownloadTask) + def output(self): + input_path = Path(self.input().path) + output_dir = input_path.parent / "raw_unzipped" + + return luigi.LocalTarget(str(output_dir)) + + def run(self): + input_dir = Path(self.input().path) # raw + output_dir = Path(self.output().path) # raw_unzipped + + + output_dir.mkdir(exist_ok=True, parents=True) + if self.source == "pmc": + # .tar.gz + # We want collapse the folder hierarchy + all_tar_files = input_dir.rglob("*.tar.gz") + for archive in all_tar_files: + output_path = output_dir / archive.stem + my_tar = tarfile.open(archive) + all_articles = [x for x in my_tar.getmembers() if x.isfile()] + for article in all_articles: + output_path = output_dir / article.path.rpartition("/")[2] + f_in = my_tar.extractfile(article) + with open(output_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + my_tar.close() + + elif self.source == "pubmed": + # .xml.gz + all_zip_files = [p for p in input_dir.iterdir() if p.suffix == ".gz"] + if not all_zip_files: + raise ValueError("No zip files were found") + + for archive in all_zip_files: + output_path = output_dir / archive.stem + with gzip.open(archive, "rb") as f_in: + with open(output_path,"wb") as f_out: + shutil.copyfileobj(f_in, f_out) + + else: + raise ValueError(f"Unsupported source {self.source}") + + + + 
+@requires(DownloadTask, UnzipTask) class TopicExtractTask(luigi.Task): source = luigi.Parameter() @@ -120,8 +201,15 @@ def run(self): output_file = Path(self.output().path) output_file.touch() + def requires(self): + if self.source in {"pmc", "pubmed"}: + return self.clone(UnzipTask) + else: + return self.clone(DownloadTask) + def output(self): - output_file = Path(self.input().path).parent / "extraction_done.txt" + input_dir = self.input()[0] + output_file = Path(input_dir.path).parent / "extraction_done.txt" return luigi.LocalTarget(str(output_file)) @@ -198,6 +286,9 @@ def run( from_month: str, filter_config: Path, output_dir: Path, + db_url: str, + db_type: str, + mesh_topic_db: Path ) -> int: """Run overall pipeline. @@ -206,6 +297,8 @@ def run( """ logger.info("Starting the overall pipeline") + DownloadTask.capture_output = CAPTURE_OUTPUT + TopicExtractTask.capture_output = CAPTURE_OUTPUT luigi.build( [ @@ -218,6 +311,8 @@ def run( # ListTask(source=source, from_month=from_month, filter_config=filter_config) ], log_level="INFO", + # workers=0, + local_scheduler=True, # prevents the task already in progress errors # log_level="INFO" ) From daeb4d748746b61fadd086d6d9c0b9a0cd04aae2 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 12:38:29 +0100 Subject: [PATCH 06/78] Implement dry run --- src/bluesearch/entrypoint/database/run.py | 47 ++++++++++++++--------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 972a01f6f..7b82821b0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -31,6 +31,7 @@ import luigi from luigi.util import inherits, requires from luigi.contrib.external_program import ExternalProgramTask +from luigi.tools.deps_tree import print_tree from bluesearch.database.article import ArticleSource @@ -114,6 +115,13 @@ def init_parser(parser: argparse.ArgumentParser) -> 
argparse.ArgumentParser: command's description for more details. """, ) + parser.add_argument( + "--dry-run", + "-n", + action="store_true", + help="Prints out a diagram of the pipeline without running it.", + ) + return parser @@ -192,7 +200,7 @@ def run(self): -@requires(DownloadTask, UnzipTask) +@inherits(DownloadTask, UnzipTask) class TopicExtractTask(luigi.Task): source = luigi.Parameter() @@ -208,8 +216,8 @@ def requires(self): return self.clone(DownloadTask) def output(self): - input_dir = self.input()[0] - output_file = Path(input_dir.path).parent / "extraction_done.txt" + input_dir = self.input() + output_file = Path(input_dir.path).parent / "topic_infos.jsonl" return luigi.LocalTarget(str(output_file)) @@ -288,7 +296,8 @@ def run( output_dir: Path, db_url: str, db_type: str, - mesh_topic_db: Path + mesh_topic_db: Path, + dry_run: bool ) -> int: """Run overall pipeline. @@ -300,20 +309,22 @@ def run( DownloadTask.capture_output = CAPTURE_OUTPUT TopicExtractTask.capture_output = CAPTURE_OUTPUT - luigi.build( - [ - AddTask( - source=source, - from_month=from_month, - filter_config=str(filter_config), - output_dir=str(output_dir), - ) - # ListTask(source=source, from_month=from_month, filter_config=filter_config) - ], - log_level="INFO", - # workers=0, - local_scheduler=True, # prevents the task already in progress errors - # log_level="INFO" + final_task = AddTask( + source=source, + from_month=from_month, + filter_config=str(filter_config), + output_dir=str(output_dir), ) + luigi_kwargs = { + "tasks": [final_task], + "log_level": "DEBUG", + "local_scheduler": True, + } + if dry_run: + print(print_tree(final_task, last=False)) + else: + + luigi.build(**luigi_kwargs) + return 0 From 621c6bb6a06f3494fe13f9bbbc0f224639f6c9b6 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 12:48:39 +0100 Subject: [PATCH 07/78] Turn positionals into required options Should improve readability --- src/bluesearch/entrypoint/database/run.py | 15 ++++++++++----- 1 
file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 7b82821b0..cfd7f66f4 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -55,26 +55,30 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.description = "Run the overall pipeline." parser.add_argument( - "source", + "--source", + required=True, type=str, choices=[member.value for member in ArticleSource], help="Source of the articles.", ) parser.add_argument( - "from_month", + "--from-month", + required=True, type=str, help="The starting month (included) for the download in format YYYY-MM. " "All papers from the given month until today will be downloaded.", ) parser.add_argument( - "filter_config", + "--filter-config", + required=True, type=Path, help=""" Path to a .JSONL file that defines all the rules for filtering. """, ) parser.add_argument( - "output_dir", + "--output-dir", + required=True, type=Path, help=""" Path to the output folder. All the results stored under @@ -83,7 +87,8 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """, ) parser.add_argument( - "db_url", + "--db-url", + required=True, type=str, help=""" The location of the database depending on the database type. 
From d3f97ac05a5fd1c244a0e3396a435393ada73861 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 13:46:26 +0100 Subject: [PATCH 08/78] Implement TopicExtractTask --- src/bluesearch/entrypoint/database/run.py | 26 +++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index cfd7f66f4..2107419c2 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -154,6 +154,8 @@ def program_args(self): BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, ] + + @requires(DownloadTask) class UnzipTask(ExternalProgramTask): """Needs to support unziping of both pubmed and pmc.""" @@ -206,13 +208,9 @@ def run(self): @inherits(DownloadTask, UnzipTask) -class TopicExtractTask(luigi.Task): +class TopicExtractTask(ExternalProgramTask): source = luigi.Parameter() - - def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() + mesh_topic_db = luigi.Parameter() def requires(self): if self.source in {"pmc", "pubmed"}: @@ -226,6 +224,21 @@ def output(self): return luigi.LocalTarget(str(output_file)) + + def program_args(self): + input_dir = self.input().path + output_dir = self.output().path + + command = [ + BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, + ] + + if self.source in {"pmc", "pubmed"}: + command.append(f"--mesh-topic-db={self.mesh_topic_db}") + + return command + + # @inherits(TopicExtractTask) @requires(TopicExtractTask) class TopicFilterTask(luigi.Task): @@ -319,6 +332,7 @@ def run( from_month=from_month, filter_config=str(filter_config), output_dir=str(output_dir), + mesh_topic_db=str(mesh_topic_db), ) luigi_kwargs = { From a4710eb7155d6224595dcc1b78df8885a7f61990 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 14:02:46 +0100 Subject: [PATCH 09/78] Implement topicfiltertask --- 
src/bluesearch/entrypoint/database/run.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 2107419c2..6dda94e61 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -239,21 +239,25 @@ def program_args(self): return command -# @inherits(TopicExtractTask) @requires(TopicExtractTask) -class TopicFilterTask(luigi.Task): +class TopicFilterTask(ExternalProgramTask): filter_config = luigi.Parameter() - def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() - def output(self): - output_file = Path(self.input().path).parent / "filtering_done.txt" + output_file = Path(self.input().path).parent / "filtering.csv" return luigi.LocalTarget(str(output_file)) + def program_args(self): + extracted_topics = self.input().path + output_file = self.output().path + + command = [ + BBS_BINARY, "topic-filter", "-v", extracted_topics, self.filter_config, output_file, + ] + + return command + @requires(TopicFilterTask) class ConvertPDFTask(luigi.Task): def run(self): From a1b0e86bbe2c164fc29c3289ef18ec69f6e76174 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 14:58:33 +0100 Subject: [PATCH 10/78] Add create symlinks task --- src/bluesearch/entrypoint/database/run.py | 80 ++++++++++++++++++++--- 1 file changed, 71 insertions(+), 9 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 6dda94e61..c3217efc0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -29,6 +29,7 @@ from typing import Iterator import luigi +import pandas as pd from luigi.util import inherits, requires from luigi.contrib.external_program import ExternalProgramTask from luigi.tools.deps_tree import print_tree @@ -126,6 +127,16 @@ def init_parser(parser: 
argparse.ArgumentParser) -> argparse.ArgumentParser: action="store_true", help="Prints out a diagram of the pipeline without running it.", ) + parser.add_argument( + "--grobid-host", + type=str, + help="The host of the GROBID server.", + ) + parser.add_argument( + "--grobid-port", + type=int, + help="The port of the GROBID server.", + ) return parser @@ -258,21 +269,68 @@ def program_args(self): return command + @requires(TopicFilterTask) -class ConvertPDFTask(luigi.Task): +class CreateSymlinksTask(luigi.Task): + def output(self): + output_dir = Path(self.input().path).parent / "filtered" + + return luigi.LocalTarget(str(output_dir)) + def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() + output_dir = Path(self.output().path) + filtering_path = Path(self.input().path) + input_dir = output_dir.parent / "raw_unzipped" + + if (output_dir.parent / "raw_unzipped").exists(): + input_dir = output_dir.parent / "raw_unzipped" + else: + input_dir = output_dir.parent / "raw" + + filtering = pd.read_csv(filtering_path) + accepted = filtering[filtering.accept].path + + def create_symlink(path): + input_path = Path(path) + output_path = output_dir / input_path.name + output_path.symlink_to(input_path) + + output_dir.mkdir(exist_ok=True) + + accepted.apply(create_symlink) + + + + +@requires(CreateSymlinksTask) +class ConvertPDFTask(ExternalProgramTask): + grobid_host = luigi.Parameter() + grobid_port = luigi.Parameter() + + + def program_args(self): + input_dir = Path(self.input().path).parent / "raw" + output_dir = self.output().path + + command = [ + BBS_BINARY, + "convert-pdf", + "-v", + self.grobid_host, + self.grobid_port, + input_dir, + f"--output_dir={output_dir}", + ] + + return command def output(self): - output_file = Path(self.input().path).parent / "converting_pdf_done.txt" + output_file = Path(self.input().path).parent / "converted_pdfs" return luigi.LocalTarget(str(output_file)) -@inherits(ConvertPDFTask, 
TopicFilterTask) -# @requires(TopicFilterTask) +@inherits(ConvertPDFTask, CreateSymlinksTask) class ParseTask(luigi.Task): def run(self): print(self.__class__.__name__) @@ -318,8 +376,10 @@ def run( output_dir: Path, db_url: str, db_type: str, - mesh_topic_db: Path, - dry_run: bool + mesh_topic_db: Path | None, + dry_run: bool, + grobid_host: str | None, + grobid_port: int | None, ) -> int: """Run overall pipeline. @@ -337,6 +397,8 @@ def run( filter_config=str(filter_config), output_dir=str(output_dir), mesh_topic_db=str(mesh_topic_db), + grobid_host=grobid_host, + grobid_port=grobid_port, ) luigi_kwargs = { From 2d7c068ed6b844da105082479dbb75e2e3279586 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 15:25:39 +0100 Subject: [PATCH 11/78] Implement convertpdf task --- src/bluesearch/entrypoint/database/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index c3217efc0..a4dfc5c6e 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -309,7 +309,7 @@ class ConvertPDFTask(ExternalProgramTask): def program_args(self): - input_dir = Path(self.input().path).parent / "raw" + input_dir = Path(self.input().path).parent / "filtered" output_dir = self.output().path command = [ @@ -319,7 +319,7 @@ def program_args(self): self.grobid_host, self.grobid_port, input_dir, - f"--output_dir={output_dir}", + f"--output-dir={output_dir}", ] return command From 2a37dea7cc494fa8ee23ef442db3fffe8a01ba5b Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 15:39:24 +0100 Subject: [PATCH 12/78] Implement parse task --- src/bluesearch/entrypoint/database/run.py | 41 ++++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index a4dfc5c6e..1b0161722 100644 --- 
a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -331,13 +331,7 @@ def output(self): @inherits(ConvertPDFTask, CreateSymlinksTask) -class ParseTask(luigi.Task): - def run(self): - print(self.__class__.__name__) - - output_file = Path(self.output().path) - output_file.touch() - +class ParseTask(ExternalProgramTask): def requires(self): if self.source == "arxiv": return self.clone(ConvertPDFTask) @@ -345,10 +339,41 @@ def requires(self): return self.clone(TopicFilterTask) def output(self): - output_file = Path(self.input().path).parent / "parsing_done.txt" + output_file = Path(self.input().path).parent / "parsed" return luigi.LocalTarget(str(output_file)) + def program_args(self): + output_dir = Path(self.output().path) + output_dir.mkdir(exist_ok=True) + + + if (output_dir.parent / "converted_pdfs").exists(): + input_dir = output_dir.parent / "converted_pdfs" + else: + input_dir = output_dir.parent / "filtered" + + # Determine parser + source2parser = { + "arxiv": "tei-xml-arxiv", + "biorxiv": "jatx-xml", + "medrxiv": "jatx-xml", + "pmc": "jatx-xml", + "pubmed": "pubmed-xml", + } + parser = source2parser[self.source] + + command = [ + BBS_BINARY, + "parse", + "-v", + parser, + input_dir, + output_dir, + ] + + return command + @requires(ParseTask) class AddTask(luigi.Task): def run(self): From e34439b1c0944f0bda6311878a482956fad4fa0d Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 16:38:45 +0100 Subject: [PATCH 13/78] Implement AddTask --- src/bluesearch/entrypoint/database/run.py | 55 +++++++++++++++++++---- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 1b0161722..451e6dc73 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -30,6 +30,7 @@ import luigi import pandas as pd +import sqlalchemy from luigi.util import inherits, requires from 
luigi.contrib.external_program import ExternalProgramTask from luigi.tools.deps_tree import print_tree @@ -374,17 +375,53 @@ def program_args(self): return command + @requires(ParseTask) -class AddTask(luigi.Task): - def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() +class AddTask(ExternalProgramTask): + db_url = luigi.Parameter() + db_type = luigi.Parameter() + + def complete(self): + # If all the articles are inside + if self.db_type == "sqlite": + prefix = "sqlite:///" + elif self.db_type == "postgres": + prefix = "postgresql+pg8000://" + else: + raise ValueError + + engine = sqlalchemy.create_engine(f"{prefix}{self.db_url}") + + input_dir = Path(self.input().path) + all_uids = [article.stem for article in input_dir.iterdir() if article.suffix == ".json"] + + new_uids = [] + for uid in all_uids: + query = "SELECT article_id from articles WHERE article_id = ?" + res = engine.execute(query, (uid,)).fetchall() + + if not res: + new_uids.append(uid) + + return not new_uids + + + def program_args(self): + input_dir = Path(self.input().path) + + + command = [ + BBS_BINARY, + "add", + self.db_url, + input_dir, + "-v", + f"--db-type={self.db_type}", + ] + + return command - def output(self): - output_file = Path(self.input().path).parent / "adding_done.txt" - return luigi.LocalTarget(str(output_file)) @requires(AddTask) class ListTask(ExternalProgramTask): @@ -424,6 +461,8 @@ def run( mesh_topic_db=str(mesh_topic_db), grobid_host=grobid_host, grobid_port=grobid_port, + db_url=db_url, + db_type=db_type, ) luigi_kwargs = { From 06df235631a5be8833dbb79fa9f13473f1a442f9 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 10:18:54 +0100 Subject: [PATCH 14/78] Improve logic in custom compleete --- src/bluesearch/entrypoint/database/run.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 451e6dc73..6e47df0a4 
100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -393,6 +393,9 @@ def complete(self): engine = sqlalchemy.create_engine(f"{prefix}{self.db_url}") input_dir = Path(self.input().path) + if not input_dir.exists(): + return False + all_uids = [article.stem for article in input_dir.iterdir() if article.suffix == ".json"] new_uids = [] From 973b284b246a32af99165f110fcf6b28153bbe4a Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 10:36:31 +0100 Subject: [PATCH 15/78] Handle keyboardinterrupt in topic-extract --- .../entrypoint/database/topic_extract.py | 105 +++++++++--------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 5e5ce9025..4ef6d26e3 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -164,62 +164,65 @@ def run( article_source = ArticleSource(source) all_results: list[dict[str, Any]] = [] - if article_source is ArticleSource.PMC: - if mesh_topic_db is None: - logger.error("The option --mesh-topics-db is mandatory for source type pmc") - return 1 - mesh_tree = mesh.MeSHTree.load(mesh_topic_db) - for path in inputs: - logger.info(f"Processing {path}") - topic_info = TopicInfo(source=article_source, path=path.resolve()) - journal_topics = get_topics_for_pmc_article(path) - if journal_topics: - topic_info.add_journal_topics( - "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) - ) - all_results.append(topic_info.json()) - elif article_source is ArticleSource.PUBMED: - if mesh_topic_db is None: - logger.error( - "The option --mesh-topics-db is mandatory for source type pubmed" - ) - return 1 - mesh_tree = mesh.MeSHTree.load(mesh_topic_db) - for path in inputs: - logger.info(f"Processing {path}") - articles = ElementTree.parse(input_path) - for i, article in 
enumerate(articles.iter("PubmedArticle")): - topic_info = TopicInfo( - source=article_source, - path=path.resolve(), - element_in_file=i, - ) - article_topics = extract_article_topics_for_pubmed_article(article) - journal_topics = extract_journal_topics_for_pubmed_article(article) - if article_topics: - topic_info.add_article_topics( - "MeSH", mesh.resolve_parents(article_topics, mesh_tree) - ) + try: + if article_source is ArticleSource.PMC: + if mesh_topic_db is None: + logger.error("The option --mesh-topics-db is mandatory for source type pmc") + return 1 + mesh_tree = mesh.MeSHTree.load(mesh_topic_db) + for path in inputs: + logger.info(f"Processing {path}") + topic_info = TopicInfo(source=article_source, path=path.resolve()) + journal_topics = get_topics_for_pmc_article(path) if journal_topics: topic_info.add_journal_topics( "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) ) all_results.append(topic_info.json()) - elif article_source is ArticleSource.ARXIV: - for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): - topic_info = TopicInfo(source=article_source, path=path) - topic_info.add_article_topics("arXiv", article_topics) - all_results.append(topic_info.json()) - elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: - for path in inputs: - logger.info(f"Processing {path}") - topic, journal = extract_article_topics_from_medrxiv_article(path) - topic_info = TopicInfo(source=ArticleSource(journal), path=path) - topic_info.add_article_topics("Subject Area", [topic]) - all_results.append(topic_info.json()) - else: - logger.error(f"The source type {source!r} is not implemented yet") - return 1 + elif article_source is ArticleSource.PUBMED: + if mesh_topic_db is None: + logger.error( + "The option --mesh-topics-db is mandatory for source type pubmed" + ) + return 1 + mesh_tree = mesh.MeSHTree.load(mesh_topic_db) + for path in inputs: + logger.info(f"Processing {path}") + articles = ElementTree.parse(input_path) + for 
i, article in enumerate(articles.iter("PubmedArticle")): + topic_info = TopicInfo( + source=article_source, + path=path.resolve(), + element_in_file=i, + ) + article_topics = extract_article_topics_for_pubmed_article(article) + journal_topics = extract_journal_topics_for_pubmed_article(article) + if article_topics: + topic_info.add_article_topics( + "MeSH", mesh.resolve_parents(article_topics, mesh_tree) + ) + if journal_topics: + topic_info.add_journal_topics( + "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) + ) + all_results.append(topic_info.json()) + elif article_source is ArticleSource.ARXIV: + for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): + topic_info = TopicInfo(source=article_source, path=path) + topic_info.add_article_topics("arXiv", article_topics) + all_results.append(topic_info.json()) + elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: + for path in inputs: + logger.info(f"Processing {path}") + topic, journal = extract_article_topics_from_medrxiv_article(path) + topic_info = TopicInfo(source=ArticleSource(journal), path=path) + topic_info.add_article_topics("Subject Area", [topic]) + all_results.append(topic_info.json()) + else: + logger.error(f"The source type {source!r} is not implemented yet") + return 1 + except KeyboardInterrupt: + pass JSONL.dump_jsonl(all_results, output_file, overwrite) From 4da3450e61422d09ed304c0bec0f931bcf45e793 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 11:05:57 +0100 Subject: [PATCH 16/78] Timeout experiments Very unsuccessful --- src/bluesearch/entrypoint/database/run.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 6e47df0a4..24299eef0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -142,7 +142,8 @@ def init_parser(parser: argparse.ArgumentParser) -> 
argparse.ArgumentParser: return parser -BBS_BINARY = "bbs_database" +BBS_BINARY = ["gtimeout", "--preserve-status", "5" , "bbs_database"] +BBS_BINARY = ["bbs_database"] CAPTURE_OUTPUT = False class DownloadTask(ExternalProgramTask): @@ -163,7 +164,7 @@ def output(self): def program_args(self): output_dir = self.output().path return [ - BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, + *BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, ] @@ -242,7 +243,7 @@ def program_args(self): output_dir = self.output().path command = [ - BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, + *BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, ] if self.source in {"pmc", "pubmed"}: @@ -265,7 +266,7 @@ def program_args(self): output_file = self.output().path command = [ - BBS_BINARY, "topic-filter", "-v", extracted_topics, self.filter_config, output_file, + *BBS_BINARY, "topic-filter", "-v", extracted_topics, self.filter_config, output_file, ] return command @@ -314,7 +315,7 @@ def program_args(self): output_dir = self.output().path command = [ - BBS_BINARY, + *BBS_BINARY, "convert-pdf", "-v", self.grobid_host, @@ -365,7 +366,7 @@ def program_args(self): parser = source2parser[self.source] command = [ - BBS_BINARY, + *BBS_BINARY, "parse", "-v", parser, @@ -425,13 +426,8 @@ def program_args(self): return command - -@requires(AddTask) -class ListTask(ExternalProgramTask): - capture_output = False - def program_args(self): - return ["ls", "-alh", "luigi/temp/"] - +class worker(luigi.Config): + timeout = luigi.IntParameter(5) def run( *, @@ -468,6 +464,7 @@ def run( db_type=db_type, ) + luigi_kwargs = { "tasks": [final_task], "log_level": "DEBUG", From 8ffbb61773e1a6e2e7452847abd15d144f50086b Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 11:51:37 +0100 Subject: [PATCH 17/78] Remove keyboardinterrupt catching --- src/bluesearch/entrypoint/database/run.py | 2 +- 
.../entrypoint/database/topic_extract.py | 105 +++++++++--------- 2 files changed, 52 insertions(+), 55 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 24299eef0..59669e67c 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -415,7 +415,7 @@ def program_args(self): command = [ - BBS_BINARY, + *BBS_BINARY, "add", self.db_url, input_dir, diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 4ef6d26e3..5e5ce9025 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -164,65 +164,62 @@ def run( article_source = ArticleSource(source) all_results: list[dict[str, Any]] = [] - try: - if article_source is ArticleSource.PMC: - if mesh_topic_db is None: - logger.error("The option --mesh-topics-db is mandatory for source type pmc") - return 1 - mesh_tree = mesh.MeSHTree.load(mesh_topic_db) - for path in inputs: - logger.info(f"Processing {path}") - topic_info = TopicInfo(source=article_source, path=path.resolve()) - journal_topics = get_topics_for_pmc_article(path) + if article_source is ArticleSource.PMC: + if mesh_topic_db is None: + logger.error("The option --mesh-topics-db is mandatory for source type pmc") + return 1 + mesh_tree = mesh.MeSHTree.load(mesh_topic_db) + for path in inputs: + logger.info(f"Processing {path}") + topic_info = TopicInfo(source=article_source, path=path.resolve()) + journal_topics = get_topics_for_pmc_article(path) + if journal_topics: + topic_info.add_journal_topics( + "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) + ) + all_results.append(topic_info.json()) + elif article_source is ArticleSource.PUBMED: + if mesh_topic_db is None: + logger.error( + "The option --mesh-topics-db is mandatory for source type pubmed" + ) + return 1 + mesh_tree = mesh.MeSHTree.load(mesh_topic_db) 
+ for path in inputs: + logger.info(f"Processing {path}") + articles = ElementTree.parse(input_path) + for i, article in enumerate(articles.iter("PubmedArticle")): + topic_info = TopicInfo( + source=article_source, + path=path.resolve(), + element_in_file=i, + ) + article_topics = extract_article_topics_for_pubmed_article(article) + journal_topics = extract_journal_topics_for_pubmed_article(article) + if article_topics: + topic_info.add_article_topics( + "MeSH", mesh.resolve_parents(article_topics, mesh_tree) + ) if journal_topics: topic_info.add_journal_topics( "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) ) all_results.append(topic_info.json()) - elif article_source is ArticleSource.PUBMED: - if mesh_topic_db is None: - logger.error( - "The option --mesh-topics-db is mandatory for source type pubmed" - ) - return 1 - mesh_tree = mesh.MeSHTree.load(mesh_topic_db) - for path in inputs: - logger.info(f"Processing {path}") - articles = ElementTree.parse(input_path) - for i, article in enumerate(articles.iter("PubmedArticle")): - topic_info = TopicInfo( - source=article_source, - path=path.resolve(), - element_in_file=i, - ) - article_topics = extract_article_topics_for_pubmed_article(article) - journal_topics = extract_journal_topics_for_pubmed_article(article) - if article_topics: - topic_info.add_article_topics( - "MeSH", mesh.resolve_parents(article_topics, mesh_tree) - ) - if journal_topics: - topic_info.add_journal_topics( - "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) - ) - all_results.append(topic_info.json()) - elif article_source is ArticleSource.ARXIV: - for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): - topic_info = TopicInfo(source=article_source, path=path) - topic_info.add_article_topics("arXiv", article_topics) - all_results.append(topic_info.json()) - elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: - for path in inputs: - logger.info(f"Processing {path}") - topic, journal = 
extract_article_topics_from_medrxiv_article(path) - topic_info = TopicInfo(source=ArticleSource(journal), path=path) - topic_info.add_article_topics("Subject Area", [topic]) - all_results.append(topic_info.json()) - else: - logger.error(f"The source type {source!r} is not implemented yet") - return 1 - except KeyboardInterrupt: - pass + elif article_source is ArticleSource.ARXIV: + for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): + topic_info = TopicInfo(source=article_source, path=path) + topic_info.add_article_topics("arXiv", article_topics) + all_results.append(topic_info.json()) + elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: + for path in inputs: + logger.info(f"Processing {path}") + topic, journal = extract_article_topics_from_medrxiv_article(path) + topic_info = TopicInfo(source=ArticleSource(journal), path=path) + topic_info.add_article_topics("Subject Area", [topic]) + all_results.append(topic_info.json()) + else: + logger.error(f"The source type {source!r} is not implemented yet") + return 1 JSONL.dump_jsonl(all_results, output_file, overwrite) From 5b647678e347752b0ff25671a83fd15f9d363e7b Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 12:02:50 +0100 Subject: [PATCH 18/78] Fix typo and wrong task dependency --- src/bluesearch/entrypoint/database/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 59669e67c..fbbac07d0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -338,7 +338,7 @@ def requires(self): if self.source == "arxiv": return self.clone(ConvertPDFTask) else: - return self.clone(TopicFilterTask) + return self.clone(CreateSymlinksTask) def output(self): output_file = Path(self.input().path).parent / "parsed" @@ -358,9 +358,9 @@ def program_args(self): # Determine parser source2parser = { "arxiv": 
"tei-xml-arxiv", - "biorxiv": "jatx-xml", - "medrxiv": "jatx-xml", - "pmc": "jatx-xml", + "biorxiv": "jats-xml", + "medrxiv": "jats-xml", + "pmc": "jats-xml", "pubmed": "pubmed-xml", } parser = source2parser[self.source] From b7ef2caac6fb91fb980f4f201dd0e0bee6c41e03 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 15:37:59 +0100 Subject: [PATCH 19/78] Add small changes --- src/bluesearch/entrypoint/database/run.py | 9 +++++++-- src/bluesearch/entrypoint/database/topic_extract.py | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index fbbac07d0..43b25963b 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -245,6 +245,11 @@ def program_args(self): command = [ *BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, ] + + if self.source in {"medrxiv", "biorxiv"}: + command.extend( + ["-R", "-m", r".*\.meca$"], + ) if self.source in {"pmc", "pubmed"}: command.append(f"--mesh-topic-db={self.mesh_topic_db}") @@ -290,7 +295,7 @@ def run(self): input_dir = output_dir.parent / "raw" filtering = pd.read_csv(filtering_path) - accepted = filtering[filtering.accept].path + accepted = pd.Series(filtering[filtering.accept].path.unique()) def create_symlink(path): input_path = Path(path) @@ -361,7 +366,7 @@ def program_args(self): "biorxiv": "jats-xml", "medrxiv": "jats-xml", "pmc": "jats-xml", - "pubmed": "pubmed-xml", + "pubmed": "pubmed-xml-set", } parser = source2parser[self.source] diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 5e5ce9025..4a0590b68 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -187,7 +187,7 @@ def run( mesh_tree = mesh.MeSHTree.load(mesh_topic_db) for path in inputs: logger.info(f"Processing {path}") - articles = 
ElementTree.parse(input_path) + articles = ElementTree.parse(path) for i, article in enumerate(articles.iter("PubmedArticle")): topic_info = TopicInfo( source=article_source, @@ -214,6 +214,7 @@ def run( for path in inputs: logger.info(f"Processing {path}") topic, journal = extract_article_topics_from_medrxiv_article(path) + journal = journal.lower() topic_info = TopicInfo(source=ArticleSource(journal), path=path) topic_info.add_article_topics("Subject Area", [topic]) all_results.append(topic_info.json()) From ccd88e02bc039ffdef3e3d297739136a6a867fda Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 16:36:11 +0100 Subject: [PATCH 20/78] Add some docstrings and annotations --- src/bluesearch/entrypoint/database/run.py | 39 ++++++++++++++++------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 43b25963b..99681cd52 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -147,12 +147,17 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: CAPTURE_OUTPUT = False class DownloadTask(ExternalProgramTask): + """Download raw files. + + They will be stored in the `raw/` folder. 
+ """ source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() - def output(self): + def output(self) -> luigi.LocalTarget: + """Define download folder.""" today = datetime.today() date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" @@ -161,7 +166,8 @@ def output(self): return luigi.LocalTarget(str(output_dir)) - def program_args(self): + def program_args(self) -> list[str]: + """Define subprocess arguments.""" output_dir = self.output().path return [ *BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, @@ -171,17 +177,23 @@ def program_args(self): @requires(DownloadTask) class UnzipTask(ExternalProgramTask): - """Needs to support unziping of both pubmed and pmc.""" + """Unzip raw files (if necessary). + + Only applicable in case of `pubmed` and `pmc`. The unzipped files + are stored inside of `raw_unzipped`. + """ source = luigi.Parameter() - def output(self): + def output(self) -> luigi.LocalTarget: + """Define unzipping folder.""" input_path = Path(self.input().path) output_dir = input_path.parent / "raw_unzipped" return luigi.LocalTarget(str(output_dir)) - def run(self): + def run(self) -> None: + """Unzip.""" input_dir = Path(self.input().path) # raw output_dir = Path(self.output().path) # raw_unzipped @@ -222,16 +234,24 @@ def run(self): @inherits(DownloadTask, UnzipTask) class TopicExtractTask(ExternalProgramTask): + """Topic extraction. + + The input of this dask is either `raw/` or `raw_unzipped/` depending + on the source. The output is going to be a single file + `topic_infos.jsonl`. 
+ """ source = luigi.Parameter() mesh_topic_db = luigi.Parameter() - def requires(self): + def requires(self) -> luigi.Task: + """Define conditional dependencies.""" if self.source in {"pmc", "pubmed"}: return self.clone(UnzipTask) else: return self.clone(DownloadTask) - def output(self): + def output(self) -> luigi.LocalTarget: + """Define output file path.""" input_dir = self.input() output_file = Path(input_dir.path).parent / "topic_infos.jsonl" @@ -239,6 +259,7 @@ def output(self): def program_args(self): + """Define subprocess arguments.""" input_dir = self.input().path output_dir = self.output().path @@ -430,10 +451,6 @@ def program_args(self): return command - -class worker(luigi.Config): - timeout = luigi.IntParameter(5) - def run( *, source: str, From 5ad1a75add482c7e19293212d7fa6f94da0b4077 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 16:41:22 +0100 Subject: [PATCH 21/78] Fix the unit test --- tests/unit/entrypoint/database/test_run.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 06ba52a22..1c6229603 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -31,17 +31,32 @@ "source", "from_month", "filter_config", + "output_dir", + "db_url", + "db_type", + "mesh_topic_db", + "dry_run", + "grobid_host", + "grobid_port", } def test_init_parser(): parser = run.init_parser(argparse.ArgumentParser()) - args = parser.parse_args(["arxiv", "2021-12", "/path/to/config.jsonl"]) + args = parser.parse_args( + [ + "--source=arxiv", + "--from-month=2021-12", + "--filter-config=/path/to/config.jsonl", + "--output-dir=some/output/dir", + "--db-url=some.url" + ] + ) assert vars(args).keys() == RUN_PARAMS # Test the values assert args.source == "arxiv" - assert args.from_month == datetime.datetime(2021, 12, 1) + assert args.from_month == "2021-12" assert 
args.filter_config == pathlib.Path("/path/to/config.jsonl") From d186281159136790de3c9ca2576aa87d0deff7be Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 16:56:29 +0100 Subject: [PATCH 22/78] Write additional unit test --- src/bluesearch/entrypoint/database/run.py | 2 +- tests/unit/entrypoint/database/test_run.py | 92 ++++++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 99681cd52..ea162a941 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -333,7 +333,7 @@ def create_symlink(path): @requires(CreateSymlinksTask) class ConvertPDFTask(ExternalProgramTask): grobid_host = luigi.Parameter() - grobid_port = luigi.Parameter() + grobid_port = luigi.IntParameter() def program_args(self): diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 1c6229603..842d58575 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -62,3 +62,95 @@ def test_init_parser(): def test_run_arguments(): assert inspect.signature(run.run).parameters.keys() == RUN_PARAMS + + +@pytest.mark.parametrize( + "source,tasks", + [ + ( + "arxiv", + ( + "DownloadTask", + "TopicExtractTask", + "TopicFilterTask", + "CreateSymlinksTask", + "ConvertPDFTask", + "ParseTask", + "AddTask", + + ) + ), + ( + "biorxiv", + ( + "DownloadTask", + "TopicExtractTask", + "TopicFilterTask", + "CreateSymlinksTask", + "ParseTask", + "AddTask", + + ) + ), + ( + "medrxiv", + ( + "DownloadTask", + "TopicExtractTask", + "TopicFilterTask", + "CreateSymlinksTask", + "ParseTask", + "AddTask", + + ) + ), + ( + "pmc", + ( + "DownloadTask", + "UnzipTask", + "TopicExtractTask", + "TopicFilterTask", + "CreateSymlinksTask", + "ParseTask", + "AddTask", + + ) + ), + ( + "pubmed", + ( + "DownloadTask", + "UnzipTask", + "TopicExtractTask", + 
"TopicFilterTask", + "CreateSymlinksTask", + "ParseTask", + "AddTask", + + ) + ), + + + ] +) +def test_pipelines(source, tasks, tmp_path, capsys): + run.run( + source=source, + from_month="whatever", + filter_config="whatever", + output_dir=tmp_path, + dry_run=True, + mesh_topic_db="whatever", + grobid_host="whatever", + grobid_port=1234, + db_url="whatever", + db_type="sqlite", + ) + + captured = capsys.readouterr() + stdout_lines = reversed(captured.out.splitlines()[1:]) + + for stdout_line, task in zip(stdout_lines, tasks): + assert task in stdout_line + From 91458e485ef8366837ca1fd31634a11251804f33 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 16:56:54 +0100 Subject: [PATCH 23/78] Make black happy --- src/bluesearch/entrypoint/database/run.py | 71 ++++++++++++---------- tests/unit/entrypoint/database/test_run.py | 25 +++----- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index ea162a941..22b16bc2c 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -142,20 +142,21 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: return parser -BBS_BINARY = ["gtimeout", "--preserve-status", "5" , "bbs_database"] +BBS_BINARY = ["gtimeout", "--preserve-status", "5", "bbs_database"] BBS_BINARY = ["bbs_database"] CAPTURE_OUTPUT = False + class DownloadTask(ExternalProgramTask): """Download raw files. They will be stored in the `raw/` folder. 
""" + source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() - def output(self) -> luigi.LocalTarget: """Define download folder.""" today = datetime.today() @@ -165,16 +166,19 @@ def output(self) -> luigi.LocalTarget: return luigi.LocalTarget(str(output_dir)) - def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = self.output().path return [ - *BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, + *BBS_BINARY, + "download", + "-v", + self.source, + self.from_month, + output_dir, ] - @requires(DownloadTask) class UnzipTask(ExternalProgramTask): """Unzip raw files (if necessary). @@ -182,8 +186,8 @@ class UnzipTask(ExternalProgramTask): Only applicable in case of `pubmed` and `pmc`. The unzipped files are stored inside of `raw_unzipped`. """ - source = luigi.Parameter() + source = luigi.Parameter() def output(self) -> luigi.LocalTarget: """Define unzipping folder.""" @@ -194,10 +198,9 @@ def output(self) -> luigi.LocalTarget: def run(self) -> None: """Unzip.""" - input_dir = Path(self.input().path) # raw + input_dir = Path(self.input().path) # raw output_dir = Path(self.output().path) # raw_unzipped - output_dir.mkdir(exist_ok=True, parents=True) if self.source == "pmc": # .tar.gz @@ -223,15 +226,13 @@ def run(self) -> None: for archive in all_zip_files: output_path = output_dir / archive.stem with gzip.open(archive, "rb") as f_in: - with open(output_path,"wb") as f_out: + with open(output_path, "wb") as f_out: shutil.copyfileobj(f_in, f_out) else: raise ValueError(f"Unsupported source {self.source}") - - @inherits(DownloadTask, UnzipTask) class TopicExtractTask(ExternalProgramTask): """Topic extraction. @@ -240,6 +241,7 @@ class TopicExtractTask(ExternalProgramTask): on the source. The output is going to be a single file `topic_infos.jsonl`. 
""" + source = luigi.Parameter() mesh_topic_db = luigi.Parameter() @@ -257,21 +259,25 @@ def output(self) -> luigi.LocalTarget: return luigi.LocalTarget(str(output_file)) - def program_args(self): """Define subprocess arguments.""" input_dir = self.input().path output_dir = self.output().path command = [ - *BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, + *BBS_BINARY, + "topic-extract", + "-v", + self.source, + input_dir, + output_dir, ] if self.source in {"medrxiv", "biorxiv"}: command.extend( ["-R", "-m", r".*\.meca$"], ) - + if self.source in {"pmc", "pubmed"}: command.append(f"--mesh-topic-db={self.mesh_topic_db}") @@ -292,9 +298,14 @@ def program_args(self): output_file = self.output().path command = [ - *BBS_BINARY, "topic-filter", "-v", extracted_topics, self.filter_config, output_file, + *BBS_BINARY, + "topic-filter", + "-v", + extracted_topics, + self.filter_config, + output_file, ] - + return command @@ -308,7 +319,7 @@ def output(self): def run(self): output_dir = Path(self.output().path) filtering_path = Path(self.input().path) - input_dir = output_dir.parent / "raw_unzipped" + input_dir = output_dir.parent / "raw_unzipped" if (output_dir.parent / "raw_unzipped").exists(): input_dir = output_dir.parent / "raw_unzipped" @@ -328,14 +339,11 @@ def create_symlink(path): accepted.apply(create_symlink) - - @requires(CreateSymlinksTask) class ConvertPDFTask(ExternalProgramTask): grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() - def program_args(self): input_dir = Path(self.input().path).parent / "filtered" output_dir = self.output().path @@ -345,11 +353,11 @@ def program_args(self): "convert-pdf", "-v", self.grobid_host, - self.grobid_port, + self.grobid_port, input_dir, f"--output-dir={output_dir}", ] - + return command def output(self): @@ -375,7 +383,6 @@ def program_args(self): output_dir = Path(self.output().path) output_dir.mkdir(exist_ok=True) - if (output_dir.parent / "converted_pdfs").exists(): input_dir = 
output_dir.parent / "converted_pdfs" else: @@ -396,10 +403,10 @@ def program_args(self): "parse", "-v", parser, - input_dir, + input_dir, output_dir, ] - + return command @@ -423,7 +430,9 @@ def complete(self): if not input_dir.exists(): return False - all_uids = [article.stem for article in input_dir.iterdir() if article.suffix == ".json"] + all_uids = [ + article.stem for article in input_dir.iterdir() if article.suffix == ".json" + ] new_uids = [] for uid in all_uids: @@ -435,11 +444,9 @@ def complete(self): return not new_uids - def program_args(self): input_dir = Path(self.input().path) - command = [ *BBS_BINARY, "add", @@ -448,9 +455,10 @@ def program_args(self): "-v", f"--db-type={self.db_type}", ] - + return command + def run( *, source: str, @@ -471,8 +479,8 @@ def run( """ logger.info("Starting the overall pipeline") - DownloadTask.capture_output = CAPTURE_OUTPUT - TopicExtractTask.capture_output = CAPTURE_OUTPUT + DownloadTask.capture_output = CAPTURE_OUTPUT + TopicExtractTask.capture_output = CAPTURE_OUTPUT final_task = AddTask( source=source, @@ -486,7 +494,6 @@ def run( db_type=db_type, ) - luigi_kwargs = { "tasks": [final_task], "log_level": "DEBUG", diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 842d58575..162eac5c4 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -40,6 +40,7 @@ "grobid_port", } + def test_init_parser(): parser = run.init_parser(argparse.ArgumentParser()) @@ -49,7 +50,7 @@ def test_init_parser(): "--from-month=2021-12", "--filter-config=/path/to/config.jsonl", "--output-dir=some/output/dir", - "--db-url=some.url" + "--db-url=some.url", ] ) assert vars(args).keys() == RUN_PARAMS @@ -65,7 +66,7 @@ def test_run_arguments(): @pytest.mark.parametrize( - "source,tasks", + "source,tasks", [ ( "arxiv", @@ -77,8 +78,7 @@ def test_run_arguments(): "ConvertPDFTask", "ParseTask", "AddTask", - - ) + ), ), ( "biorxiv", @@ -89,8 
+89,7 @@ def test_run_arguments(): "CreateSymlinksTask", "ParseTask", "AddTask", - - ) + ), ), ( "medrxiv", @@ -101,8 +100,7 @@ def test_run_arguments(): "CreateSymlinksTask", "ParseTask", "AddTask", - - ) + ), ), ( "pmc", @@ -114,8 +112,7 @@ def test_run_arguments(): "CreateSymlinksTask", "ParseTask", "AddTask", - - ) + ), ), ( "pubmed", @@ -127,12 +124,9 @@ def test_run_arguments(): "CreateSymlinksTask", "ParseTask", "AddTask", - - ) + ), ), - - - ] + ], ) def test_pipelines(source, tasks, tmp_path, capsys): run.run( @@ -153,4 +147,3 @@ def test_pipelines(source, tasks, tmp_path, capsys): for stdout_line, task in zip(stdout_lines, tasks): assert task in stdout_line - From 7387257ea60e4c4cd538a5aeede88417fbae996e Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 17:02:02 +0100 Subject: [PATCH 24/78] Add pending to the check --- tests/unit/entrypoint/database/test_run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 162eac5c4..05bf85e01 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -147,3 +147,4 @@ def test_pipelines(source, tasks, tmp_path, capsys): for stdout_line, task in zip(stdout_lines, tasks): assert task in stdout_line + assert "PENDING" in stdout_line From 3a49d3e72a94b8ce0f7c90d8a69560f4e6d1e84b Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 17:41:50 +0100 Subject: [PATCH 25/78] Configure output capturing --- src/bluesearch/entrypoint/database/add.py | 1 + src/bluesearch/entrypoint/database/run.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/add.py b/src/bluesearch/entrypoint/database/add.py index 7055f2941..b9f9814b6 100644 --- a/src/bluesearch/entrypoint/database/add.py +++ b/src/bluesearch/entrypoint/database/add.py @@ -124,6 +124,7 @@ def run( sentence_mappings = [] for article in articles: + 
logger.info(f"Processing {article.uid}") article_mapping = { "article_id": article.uid, diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 22b16bc2c..b6bee27c3 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -480,7 +480,13 @@ def run( logger.info("Starting the overall pipeline") DownloadTask.capture_output = CAPTURE_OUTPUT + UnzipTask.capture_output = CAPTURE_OUTPUT TopicExtractTask.capture_output = CAPTURE_OUTPUT + TopicFilterTask.capture_output = CAPTURE_OUTPUT + CreateSymlinksTask.capture_output = CAPTURE_OUTPUT + ConvertPDFTask.capture_output = CAPTURE_OUTPUT + ParseTask.capture_output = CAPTURE_OUTPUT + AddTask.capture_output = CAPTURE_OUTPUT final_task = AddTask( source=source, @@ -497,7 +503,7 @@ def run( luigi_kwargs = { "tasks": [final_task], "log_level": "DEBUG", - "local_scheduler": True, + "local_scheduler": False, } if dry_run: print(print_tree(final_task, last=False)) From 4cc4208f2db88ca6ff47422abde5b55db6031225 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 10:53:11 +0100 Subject: [PATCH 26/78] Add local timeout hack Requires custom_timeout binary to be in the PATH --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index b6bee27c3..a418b9a91 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -142,7 +142,6 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: return parser -BBS_BINARY = ["gtimeout", "--preserve-status", "5", "bbs_database"] BBS_BINARY = ["bbs_database"] CAPTURE_OUTPUT = False @@ -170,6 +169,7 @@ def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = self.output().path return [ + "custom_timeout", *BBS_BINARY, "download", "-v", From 
7b55f2e9115e2ec969306fb5b5f4a37ee2b09a74 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 10:55:47 +0100 Subject: [PATCH 27/78] Only use local-scheduler --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index a418b9a91..f28601aaa 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -503,7 +503,7 @@ def run( luigi_kwargs = { "tasks": [final_task], "log_level": "DEBUG", - "local_scheduler": False, + "local_scheduler": True, } if dry_run: print(print_tree(final_task, last=False)) From 9f23d4c0ec8bdfff07b2193edba5a5811fde19a4 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 11:38:50 +0100 Subject: [PATCH 28/78] Turn entrypoint verbosity into global variable --- src/bluesearch/entrypoint/database/run.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index f28601aaa..cb48a5e64 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -143,6 +143,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: BBS_BINARY = ["bbs_database"] +VERBOSITY = ["-v"] # for the entrypoint subprocesses CAPTURE_OUTPUT = False @@ -172,7 +173,7 @@ def program_args(self) -> list[str]: "custom_timeout", *BBS_BINARY, "download", - "-v", + *VERBOSITY, self.source, self.from_month, output_dir, @@ -267,7 +268,7 @@ def program_args(self): command = [ *BBS_BINARY, "topic-extract", - "-v", + *VERBOSITY, self.source, input_dir, output_dir, @@ -300,7 +301,7 @@ def program_args(self): command = [ *BBS_BINARY, "topic-filter", - "-v", + *VERBOSITY, extracted_topics, self.filter_config, output_file, @@ -351,7 +352,7 @@ def program_args(self): command = [ *BBS_BINARY, "convert-pdf", - "-v", + 
*VERBOSITY, self.grobid_host, self.grobid_port, input_dir, @@ -401,7 +402,7 @@ def program_args(self): command = [ *BBS_BINARY, "parse", - "-v", + *VERBOSITY, parser, input_dir, output_dir, @@ -450,9 +451,9 @@ def program_args(self): command = [ *BBS_BINARY, "add", + *VERBOSITY, self.db_url, input_dir, - "-v", f"--db-type={self.db_type}", ] From 4982824b902447854e64d3631be9de16f663e570 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 12:18:03 +0100 Subject: [PATCH 29/78] Fix source2parse and also postgres complete check --- src/bluesearch/entrypoint/database/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index cb48a5e64..d8b3cbfd6 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -392,8 +392,8 @@ def program_args(self): # Determine parser source2parser = { "arxiv": "tei-xml-arxiv", - "biorxiv": "jats-xml", - "medrxiv": "jats-xml", + "biorxiv": "jats-meca", + "medrxiv": "jats-meca", "pmc": "jats-xml", "pubmed": "pubmed-xml-set", } @@ -437,8 +437,8 @@ def complete(self): new_uids = [] for uid in all_uids: - query = "SELECT article_id from articles WHERE article_id = ?" 
- res = engine.execute(query, (uid,)).fetchall() + query = sqlalchemy.text("SELECT article_id from articles WHERE article_id = :uid") + res = engine.execute(query, uid=uid).fetchall() if not res: new_uids.append(uid) From af47a56f7bf4366e24092b5bf286a35bba6eccf0 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:21:19 +0100 Subject: [PATCH 30/78] Add luigi to requirements --- requirements.txt | 1 + setup.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index effce482e..7cfc95be1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,7 @@ ipython==7.31.1 ipywidgets==7.6.3 jupyterlab==3.0.17 langdetect==1.0.9 +luigi==3.0.3 mashumaro==3.0 numpy==1.21.0 pandas==1.3.0 diff --git a/setup.py b/setup.py index 5a2133dd8..4058d79bd 100644 --- a/setup.py +++ b/setup.py @@ -60,10 +60,11 @@ "ipywidgets", "jupyterlab>=3", "langdetect", - "numpy>=1.20.1", - "pandas>=1", + "luigi", # Serialization framework on top of dataclasses, e.g. 'Article' to and from JSON. 
"mashumaro>=3.0", + "numpy>=1.20.1", + "pandas>=1", "pg8000", "python-dotenv", "requests", From d937dd11595bf40a533b34d5458ddbda84d0fd03 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:29:16 +0100 Subject: [PATCH 31/78] Run black --- src/bluesearch/entrypoint/database/run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index d8b3cbfd6..53ba06acf 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -437,7 +437,9 @@ def complete(self): new_uids = [] for uid in all_uids: - query = sqlalchemy.text("SELECT article_id from articles WHERE article_id = :uid") + query = sqlalchemy.text( + "SELECT article_id from articles WHERE article_id = :uid" + ) res = engine.execute(query, uid=uid).fetchall() if not res: From 862358819e76a7435574d68ff43863448ad6d6bb Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:36:39 +0100 Subject: [PATCH 32/78] Correct flake8 mistakes --- src/bluesearch/entrypoint/database/run.py | 9 --------- tests/unit/entrypoint/database/test_run.py | 4 ---- 2 files changed, 13 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 53ba06acf..31abbf905 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -19,14 +19,11 @@ import argparse import gzip -import json import logging import shutil import tarfile -import warnings from datetime import datetime from pathlib import Path -from typing import Iterator import luigi import pandas as pd @@ -320,12 +317,6 @@ def output(self): def run(self): output_dir = Path(self.output().path) filtering_path = Path(self.input().path) - input_dir = output_dir.parent / "raw_unzipped" - - if (output_dir.parent / "raw_unzipped").exists(): - input_dir = output_dir.parent / "raw_unzipped" - else: - input_dir = 
output_dir.parent / "raw" filtering = pd.read_csv(filtering_path) accepted = pd.Series(filtering[filtering.accept].path.unique()) diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 05bf85e01..5694ba232 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -16,16 +16,12 @@ # along with this program. If not, see . import argparse -import datetime import inspect import pathlib -import numpy as np -import pandas as pd import pytest from bluesearch.entrypoint.database import run -from bluesearch.utils import JSONL RUN_PARAMS = { "source", From db5768dcd700a3f2f4b1730c6c37b16a7226cf86 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:38:57 +0100 Subject: [PATCH 33/78] Fix isort problems --- src/bluesearch/entrypoint/database/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 31abbf905..d0ea16e7f 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -25,14 +25,14 @@ from datetime import datetime from pathlib import Path -import luigi import pandas as pd import sqlalchemy -from luigi.util import inherits, requires -from luigi.contrib.external_program import ExternalProgramTask -from luigi.tools.deps_tree import print_tree +import luigi from bluesearch.database.article import ArticleSource +from luigi.contrib.external_program import ExternalProgramTask +from luigi.tools.deps_tree import print_tree +from luigi.util import inherits, requires logger = logging.getLogger(__name__) From 9b19dc605ed58ceb5b4bd29c566c1becf915f08f Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:48:26 +0100 Subject: [PATCH 34/78] Fix typing --- src/bluesearch/entrypoint/database/run.py | 8 ++++---- tests/unit/entrypoint/database/test_run.py | 4 ++-- 2 files changed, 6 insertions(+), 6 
deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index d0ea16e7f..7de7a6c34 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -212,7 +212,7 @@ def run(self) -> None: output_path = output_dir / article.path.rpartition("/")[2] f_in = my_tar.extractfile(article) with open(output_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) + shutil.copyfileobj(f_in, f_out) # type: ignore my_tar.close() elif self.source == "pubmed": @@ -223,9 +223,9 @@ def run(self) -> None: for archive in all_zip_files: output_path = output_dir / archive.stem - with gzip.open(archive, "rb") as f_in: - with open(output_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) + with gzip.open(archive, "rb") as f_in_2: + with open(output_path, "wb") as f_out_2: + shutil.copyfileobj(f_in_2, f_out_2) else: raise ValueError(f"Unsupported source {self.source}") diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 5694ba232..402563d6e 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -128,10 +128,10 @@ def test_pipelines(source, tasks, tmp_path, capsys): run.run( source=source, from_month="whatever", - filter_config="whatever", + filter_config=pathlib.Path("whatever"), output_dir=tmp_path, dry_run=True, - mesh_topic_db="whatever", + mesh_topic_db=pathlib.Path("whatever"), grobid_host="whatever", grobid_port=1234, db_url="whatever", From 03b433badb81e55bcf0fae3d78325bfec4cc7f30 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 15:02:23 +0100 Subject: [PATCH 35/78] Add more docstrings --- src/bluesearch/entrypoint/database/run.py | 40 +++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 7de7a6c34..48b1e9d26 100644 --- 
a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -284,14 +284,22 @@ def program_args(self): @requires(TopicExtractTask) class TopicFilterTask(ExternalProgramTask): + """Run topic filtering entrypoint. + + It inputs `topic_infos.jsonl` and `filter_config` (rules) and it + generates a file `filtering.csv`. + """ + filter_config = luigi.Parameter() def output(self): + """Define output file.""" output_file = Path(self.input().path).parent / "filtering.csv" return luigi.LocalTarget(str(output_file)) def program_args(self): + """Define subprocess arguments.""" extracted_topics = self.input().path output_file = self.output().path @@ -309,16 +317,22 @@ def program_args(self): @requires(TopicFilterTask) class CreateSymlinksTask(luigi.Task): + """Create folder of symlinked articles. + + We only symlink those articles that made it through the topic-filtering + stage. The only input is the `filtering.csv`. + """ def output(self): + """Define output folder.""" output_dir = Path(self.input().path).parent / "filtered" return luigi.LocalTarget(str(output_dir)) def run(self): + """Create symlinks.""" output_dir = Path(self.output().path) - filtering_path = Path(self.input().path) - filtering = pd.read_csv(filtering_path) + filtering = pd.read_csv(self.input()) accepted = pd.Series(filtering[filtering.accept].path.unique()) def create_symlink(path): @@ -333,10 +347,16 @@ def create_symlink(path): @requires(CreateSymlinksTask) class ConvertPDFTask(ExternalProgramTask): + """Convert PDFs to XMLs. + + Assumes that there is a GROBID server up and running. Only necessary + when `source=arxiv`. The output is the folder `converted_pdfs/`. 
+ """ grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() def program_args(self): + """Define subprocess arguments.""" input_dir = Path(self.input().path).parent / "filtered" output_dir = self.output().path @@ -353,6 +373,7 @@ def program_args(self): return command def output(self): + """Define output folder.""" output_file = Path(self.input().path).parent / "converted_pdfs" return luigi.LocalTarget(str(output_file)) @@ -360,18 +381,26 @@ def output(self): @inherits(ConvertPDFTask, CreateSymlinksTask) class ParseTask(ExternalProgramTask): + """Parse articles. + + The input is all the articles inside of `filtered/` (or in case of + `source="arxiv"` `converted_pdfs/`. + """ def requires(self): + """Define conditional dependencies.""" if self.source == "arxiv": return self.clone(ConvertPDFTask) else: return self.clone(CreateSymlinksTask) def output(self): + """Define output folder.""" output_file = Path(self.input().path).parent / "parsed" return luigi.LocalTarget(str(output_file)) def program_args(self): + """Define subprocess arguments.""" output_dir = Path(self.output().path) output_dir.mkdir(exist_ok=True) @@ -404,10 +433,16 @@ def program_args(self): @requires(ParseTask) class AddTask(ExternalProgramTask): + """Add parsed articles to the database. + + This step is considered done if all articles inside of `parsed/` are + already in the database. 
+ """ db_url = luigi.Parameter() db_type = luigi.Parameter() def complete(self): + """Check if all articles inside of `parsed/` are in the database.""" # If all the articles are inside if self.db_type == "sqlite": prefix = "sqlite:///" @@ -439,6 +474,7 @@ def complete(self): return not new_uids def program_args(self): + """Define subprocess arguments.""" input_dir = Path(self.input().path) command = [ From 91c0c71e1006df204d6597d73a3d7d0c627818a2 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 15:02:55 +0100 Subject: [PATCH 36/78] Rerun formatting --- src/bluesearch/entrypoint/database/run.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 48b1e9d26..2542600a3 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -322,6 +322,7 @@ class CreateSymlinksTask(luigi.Task): We only symlink those articles that made it through the topic-filtering stage. The only input is the `filtering.csv`. """ + def output(self): """Define output folder.""" output_dir = Path(self.input().path).parent / "filtered" @@ -352,6 +353,7 @@ class ConvertPDFTask(ExternalProgramTask): Assumes that there is a GROBID server up and running. Only necessary when `source=arxiv`. The output is the folder `converted_pdfs/`. """ + grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() @@ -386,6 +388,7 @@ class ParseTask(ExternalProgramTask): The input is all the articles inside of `filtered/` (or in case of `source="arxiv"` `converted_pdfs/`. """ + def requires(self): """Define conditional dependencies.""" if self.source == "arxiv": @@ -438,6 +441,7 @@ class AddTask(ExternalProgramTask): This step is considered done if all articles inside of `parsed/` are already in the database. 
""" + db_url = luigi.Parameter() db_type = luigi.Parameter() From b4d3d7bd93de1177c2ecb889abc14e43f6ab2b59 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 15:28:32 +0100 Subject: [PATCH 37/78] Nasty global variable date handling --- src/bluesearch/entrypoint/database/run.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 2542600a3..dafd52daa 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -142,6 +142,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: BBS_BINARY = ["bbs_database"] VERBOSITY = ["-v"] # for the entrypoint subprocesses CAPTURE_OUTPUT = False +OUTPUT_DIR_RAW = None # make sure the same datestamp for all tasks class DownloadTask(ExternalProgramTask): @@ -156,12 +157,14 @@ class DownloadTask(ExternalProgramTask): def output(self) -> luigi.LocalTarget: """Define download folder.""" - today = datetime.today() - date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + global OUTPUT_DIR_RAW + if OUTPUT_DIR_RAW is None: + today = datetime.today() + date = f"{self.from_month}_{today.strftime('%Y-%m-%d:%M-%S')}" - output_dir = Path(self.output_dir) / self.source / date / "raw" + OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / date / "raw" - return luigi.LocalTarget(str(output_dir)) + return luigi.LocalTarget(str(OUTPUT_DIR_RAW)) def program_args(self) -> list[str]: """Define subprocess arguments.""" @@ -333,7 +336,7 @@ def run(self): """Create symlinks.""" output_dir = Path(self.output().path) - filtering = pd.read_csv(self.input()) + filtering = pd.read_csv(self.input().path) accepted = pd.Series(filtering[filtering.accept].path.unique()) def create_symlink(path): From fd3f24dd6b3032fd28a370cabb06fb918a81c5a0 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 16:44:09 +0100 Subject: [PATCH 38/78] Dont consider 
minutes and seconds --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index dafd52daa..d8e47861c 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -160,7 +160,7 @@ def output(self) -> luigi.LocalTarget: global OUTPUT_DIR_RAW if OUTPUT_DIR_RAW is None: today = datetime.today() - date = f"{self.from_month}_{today.strftime('%Y-%m-%d:%M-%S')}" + date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / date / "raw" From 7b5646957805cf03b315bbe1ae7c873bd37781e8 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 17:16:15 +0100 Subject: [PATCH 39/78] Rename task to be more versatile --- src/bluesearch/entrypoint/database/run.py | 14 +++++++------- tests/unit/entrypoint/database/test_run.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index d8e47861c..b5f45f70a 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -319,10 +319,10 @@ def program_args(self): @requires(TopicFilterTask) -class CreateSymlinksTask(luigi.Task): - """Create folder of symlinked articles. +class PerformFilteringTask(luigi.Task): + """Create folder that only contains relevant articles. - We only symlink those articles that made it through the topic-filtering + We only consider those articles that made it through the topic-filtering stage. The only input is the `filtering.csv`. """ @@ -349,7 +349,7 @@ def create_symlink(path): accepted.apply(create_symlink) -@requires(CreateSymlinksTask) +@requires(PerformFilteringTask) class ConvertPDFTask(ExternalProgramTask): """Convert PDFs to XMLs. 
@@ -384,7 +384,7 @@ def output(self): return luigi.LocalTarget(str(output_file)) -@inherits(ConvertPDFTask, CreateSymlinksTask) +@inherits(ConvertPDFTask, PerformFilteringTask) class ParseTask(ExternalProgramTask): """Parse articles. @@ -397,7 +397,7 @@ def requires(self): if self.source == "arxiv": return self.clone(ConvertPDFTask) else: - return self.clone(CreateSymlinksTask) + return self.clone(PerformFilteringTask) def output(self): """Define output folder.""" @@ -520,7 +520,7 @@ def run( UnzipTask.capture_output = CAPTURE_OUTPUT TopicExtractTask.capture_output = CAPTURE_OUTPUT TopicFilterTask.capture_output = CAPTURE_OUTPUT - CreateSymlinksTask.capture_output = CAPTURE_OUTPUT + PerformFilteringTask.capture_output = CAPTURE_OUTPUT ConvertPDFTask.capture_output = CAPTURE_OUTPUT ParseTask.capture_output = CAPTURE_OUTPUT AddTask.capture_output = CAPTURE_OUTPUT diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 402563d6e..be66fac66 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -70,7 +70,7 @@ def test_run_arguments(): "DownloadTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ConvertPDFTask", "ParseTask", "AddTask", @@ -82,7 +82,7 @@ def test_run_arguments(): "DownloadTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ParseTask", "AddTask", ), @@ -93,7 +93,7 @@ def test_run_arguments(): "DownloadTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ParseTask", "AddTask", ), @@ -105,7 +105,7 @@ def test_run_arguments(): "UnzipTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ParseTask", "AddTask", ), @@ -117,7 +117,7 @@ def test_run_arguments(): "UnzipTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ParseTask", "AddTask", ), 
From fb082c1dc5f856da456e79674c9877e7c2760357 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 17:29:01 +0100 Subject: [PATCH 40/78] Write pseudocode for pubmed performfilter --- src/bluesearch/entrypoint/database/run.py | 27 +++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index b5f45f70a..01e541756 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -337,16 +337,29 @@ def run(self): output_dir = Path(self.output().path) filtering = pd.read_csv(self.input().path) - accepted = pd.Series(filtering[filtering.accept].path.unique()) - - def create_symlink(path): - input_path = Path(path) - output_path = output_dir / input_path.name - output_path.symlink_to(input_path) output_dir.mkdir(exist_ok=True) - accepted.apply(create_symlink) + if self.source == "pubmed": + # Find all input files (.xml.gz) + + # Iteratively Load each of the files in memory + # Create a copy of the XML + # Remove elements that were not accepted from the copy + # Store the copy with removed elements + + # Iteratively zip and save all of the "pruned" copies + pass + + else: + accepted = pd.Series(filtering[filtering.accept].path.unique()) + def create_symlink(path): + input_path = Path(path) + output_path = output_dir / input_path.name + output_path.symlink_to(input_path) + + + accepted.apply(create_symlink) From 1179531db0e931521634351df7092269c09979cf Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 17:33:41 +0100 Subject: [PATCH 41/78] Don't run unzipping for pubmed --- src/bluesearch/entrypoint/database/run.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 01e541756..3a1267beb 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ 
b/src/bluesearch/entrypoint/database/run.py @@ -218,18 +218,6 @@ def run(self) -> None: shutil.copyfileobj(f_in, f_out) # type: ignore my_tar.close() - elif self.source == "pubmed": - # .xml.gz - all_zip_files = [p for p in input_dir.iterdir() if p.suffix == ".gz"] - if not all_zip_files: - raise ValueError("No zip files were found") - - for archive in all_zip_files: - output_path = output_dir / archive.stem - with gzip.open(archive, "rb") as f_in_2: - with open(output_path, "wb") as f_out_2: - shutil.copyfileobj(f_in_2, f_out_2) - else: raise ValueError(f"Unsupported source {self.source}") @@ -248,7 +236,7 @@ class TopicExtractTask(ExternalProgramTask): def requires(self) -> luigi.Task: """Define conditional dependencies.""" - if self.source in {"pmc", "pubmed"}: + if self.source in {"pmc"}: return self.clone(UnzipTask) else: return self.clone(DownloadTask) From 932478976b5a14339626e6f90b6c0ae6921f0d1d Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 17:57:41 +0100 Subject: [PATCH 42/78] WIP-performfiltering task --- src/bluesearch/entrypoint/database/run.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 3a1267beb..1684543e7 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -18,6 +18,7 @@ from __future__ import annotations import argparse +import copy import gzip import logging import shutil @@ -27,6 +28,7 @@ import pandas as pd import sqlalchemy +from defusedxml import ElementTree import luigi from bluesearch.database.article import ArticleSource @@ -322,6 +324,7 @@ def output(self): def run(self): """Create symlinks.""" + output_dir = Path(self.output().path) filtering = pd.read_csv(self.input().path) @@ -330,10 +333,26 @@ def run(self): if self.source == "pubmed": # Find all input files (.xml.gz) + all_input_files = [Path(p) for p in filtering["path"].unique()] 
# Iteratively Load each of the files in memory + for input_file in all_input_files: + # Unzip it + with gzip.open(input_file) as xml_stream: + article_set = ElementTree.parse(xml_stream) + + # Create a copy of the XML - # Remove elements that were not accepted from the copy + article_set_copy = copy.deepcopy(article_set) + + # Find elements that were not accepted + to_remove = filtering[(filtering["path"] == str(input_file)) & (~filtering["accept"])] + + for eif in to_remove["element_in_file"].tolist(): + # Remove the corresponding from the copy + + + # Store the copy with removed elements # Iteratively zip and save all of the "pruned" copies From 69187d010dccdc960dd748f78c00d1081d3c3109 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 16:30:25 +0100 Subject: [PATCH 43/78] Implement subtree removal logic --- src/bluesearch/entrypoint/database/run.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 1684543e7..639144925 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -337,26 +337,29 @@ def run(self): # Iteratively Load each of the files in memory for input_file in all_input_files: - # Unzip it + # Unzip it with gzip.open(input_file) as xml_stream: article_set = ElementTree.parse(xml_stream) # Create a copy of the XML - article_set_copy = copy.deepcopy(article_set) + # article_set_copy = copy.deepcopy(article_set) + root = article_set.getroot() # Find elements that were not accepted to_remove = filtering[(filtering["path"] == str(input_file)) & (~filtering["accept"])] - - for eif in to_remove["element_in_file"].tolist(): - # Remove the corresponding from the copy + article_nodes = root.findall("PubmedArticle") + for eif in to_remove["element_in_file"].astype(int).tolist(): + # Remove the corresponding from the copy + root.remove(article_nodes[eif]) # Store the copy with 
removed elements + output_file = output_dir / input_file.stem + article_set.write(output_file) + # Zipping TODO - # Iteratively zip and save all of the "pruned" copies - pass else: accepted = pd.Series(filtering[filtering.accept].path.unique()) From 55df185bd947b9886fd030a835c2c3b5eff63a4d Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 16:44:28 +0100 Subject: [PATCH 44/78] Make luigi less verbose --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 639144925..c60899359 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -562,7 +562,7 @@ def run( luigi_kwargs = { "tasks": [final_task], - "log_level": "DEBUG", + "log_level": "WARNING", "local_scheduler": True, } if dry_run: From f629ae1af7ce14015d543b3d554838b0d4620447 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:01:16 +0100 Subject: [PATCH 45/78] Fix the immortal bug --- src/bluesearch/entrypoint/database/topic_extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index bce22a350..7a43e21d6 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -188,7 +188,7 @@ def run( mesh_tree = mesh.MeSHTree.load(mesh_topic_db) for path in inputs: logger.info(f"Processing {path}") - with gzip.open(input_path) as xml_stream: + with gzip.open(path) as xml_stream: articles = ElementTree.parse(xml_stream) for i, article in enumerate(articles.iter("PubmedArticle")): From 9702aa3ac7c8167d8e258761431e2c468cccb1a5 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:28:05 +0100 Subject: [PATCH 46/78] Make sure PerformFilteringTask zips pubmed-article-set --- 
src/bluesearch/entrypoint/database/run.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index c60899359..eb32dbd16 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -29,6 +29,7 @@ import pandas as pd import sqlalchemy from defusedxml import ElementTree +from defusedxml.cElementTree import tostring import luigi from bluesearch.database.article import ArticleSource @@ -356,9 +357,10 @@ def run(self): root.remove(article_nodes[eif]) # Store the copy with removed elements - output_file = output_dir / input_file.stem - article_set.write(output_file) - # Zipping TODO + output_file = output_dir / input_file.name + out_bytes = tostring(root) + with gzip.open(output_file, 'wb') as f: + f.write(out_bytes) else: From 857541ddb25a19259b0bb8d45abca017d9463ce5 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:29:31 +0100 Subject: [PATCH 47/78] Run formatting --- src/bluesearch/entrypoint/database/run.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index eb32dbd16..637478d8d 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -342,16 +342,16 @@ def run(self): with gzip.open(input_file) as xml_stream: article_set = ElementTree.parse(xml_stream) - # Create a copy of the XML # article_set_copy = copy.deepcopy(article_set) root = article_set.getroot() # Find elements that were not accepted - to_remove = filtering[(filtering["path"] == str(input_file)) & (~filtering["accept"])] + to_remove = filtering[ + (filtering["path"] == str(input_file)) & (~filtering["accept"]) + ] article_nodes = root.findall("PubmedArticle") - for eif in to_remove["element_in_file"].astype(int).tolist(): # Remove the corresponding from the copy 
root.remove(article_nodes[eif]) @@ -359,18 +359,17 @@ def run(self): # Store the copy with removed elements output_file = output_dir / input_file.name out_bytes = tostring(root) - with gzip.open(output_file, 'wb') as f: - f.write(out_bytes) - + with gzip.open(output_file, "wb") as f: + f.write(out_bytes) else: accepted = pd.Series(filtering[filtering.accept].path.unique()) + def create_symlink(path): input_path = Path(path) output_path = output_dir / input_path.name output_path.symlink_to(input_path) - accepted.apply(create_symlink) From 81cad764b0549f06faa8e95ae7edd92d3aa4eabe Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:32:13 +0100 Subject: [PATCH 48/78] Make sure unit tests are passing --- src/bluesearch/entrypoint/database/run.py | 2 +- tests/unit/entrypoint/database/test_run.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 637478d8d..fc7ec5674 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -29,7 +29,7 @@ import pandas as pd import sqlalchemy from defusedxml import ElementTree -from defusedxml.cElementTree import tostring +from defusedxml.ElementTree import tostring import luigi from bluesearch.database.article import ArticleSource diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index be66fac66..acd64bf5e 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -114,7 +114,6 @@ def test_run_arguments(): "pubmed", ( "DownloadTask", - "UnzipTask", "TopicExtractTask", "TopicFilterTask", "PerformFilteringTask", From 1ad559c1db5b69e6076446621334e5171890fe16 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:40:27 +0100 Subject: [PATCH 49/78] Update sphinx --- docs/source/api/bluesearch.entrypoint.database.rst | 1 + 
docs/source/api/bluesearch.entrypoint.database.run.rst | 7 +++++++ 2 files changed, 8 insertions(+) create mode 100644 docs/source/api/bluesearch.entrypoint.database.run.rst diff --git a/docs/source/api/bluesearch.entrypoint.database.rst b/docs/source/api/bluesearch.entrypoint.database.rst index 9655b8576..9f3c0c5fe 100644 --- a/docs/source/api/bluesearch.entrypoint.database.rst +++ b/docs/source/api/bluesearch.entrypoint.database.rst @@ -14,6 +14,7 @@ Submodules bluesearch.entrypoint.database.parent bluesearch.entrypoint.database.parse bluesearch.entrypoint.database.parse_mesh_rdf + bluesearch.entrypoint.database.run bluesearch.entrypoint.database.schemas bluesearch.entrypoint.database.topic_extract bluesearch.entrypoint.database.topic_filter diff --git a/docs/source/api/bluesearch.entrypoint.database.run.rst b/docs/source/api/bluesearch.entrypoint.database.run.rst new file mode 100644 index 000000000..3239ab645 --- /dev/null +++ b/docs/source/api/bluesearch.entrypoint.database.run.rst @@ -0,0 +1,7 @@ +bluesearch.entrypoint.database.run module +========================================= + +.. 
automodule:: bluesearch.entrypoint.database.run + :members: + :undoc-members: + :show-inheritance: From 36b08f4c72c443bcb264a96cd2399bac2fb4f161 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:47:26 +0100 Subject: [PATCH 50/78] Fix linting --- src/bluesearch/entrypoint/database/run.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index fc7ec5674..153fbd66f 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -18,7 +18,6 @@ from __future__ import annotations import argparse -import copy import gzip import logging import shutil @@ -325,7 +324,6 @@ def output(self): def run(self): """Create symlinks.""" - output_dir = Path(self.output().path) filtering = pd.read_csv(self.input().path) From 6efc98476a39a898a9d7ba0334302ff5f73f322d Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:49:30 +0100 Subject: [PATCH 51/78] Update docstring --- src/bluesearch/entrypoint/database/run.py | 2 +- tox.ini | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 153fbd66f..3562fe76e 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -186,7 +186,7 @@ def program_args(self) -> list[str]: class UnzipTask(ExternalProgramTask): """Unzip raw files (if necessary). - Only applicable in case of `pubmed` and `pmc`. The unzipped files + Only applicable in case of `pmc`. The unzipped files are stored inside of `raw_unzipped`. 
""" diff --git a/tox.ini b/tox.ini index 791f75f4a..d0a333ebe 100644 --- a/tox.ini +++ b/tox.ini @@ -35,7 +35,7 @@ commands = pytest -m "" {posargs:tests} [testenv:lint] description = Lint using flake8, black, isort and bandit -basepython = python3.7 +basepython = python3.8 skip_install = true deps = bandit==1.7.0 @@ -69,7 +69,7 @@ commands = [testenv:format] description = Apply black and isort -basepython = python3.7 +basepython = python3.8 skip_install = true deps = black==21.5b1 From 164d0338e3c97bac411539e44ac3c5bc8ae8cd31 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:55:48 +0100 Subject: [PATCH 52/78] Undo changes in tox.ini --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index d0a333ebe..791f75f4a 100644 --- a/tox.ini +++ b/tox.ini @@ -35,7 +35,7 @@ commands = pytest -m "" {posargs:tests} [testenv:lint] description = Lint using flake8, black, isort and bandit -basepython = python3.8 +basepython = python3.7 skip_install = true deps = bandit==1.7.0 @@ -69,7 +69,7 @@ commands = [testenv:format] description = Apply black and isort -basepython = python3.8 +basepython = python3.7 skip_install = true deps = black==21.5b1 From 8f1b8965ef694341c952ee6b746a5d539f5e5a26 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 09:51:20 +0100 Subject: [PATCH 53/78] Add luigi config Otherwise imoprts raise deprecation warnings --- luigi.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 luigi.cfg diff --git a/luigi.cfg b/luigi.cfg new file mode 100644 index 000000000..b2c955f2b --- /dev/null +++ b/luigi.cfg @@ -0,0 +1,2 @@ +[core] + autoload_range=true From 83006c97d0d146d183260fa83e77fa4fe59854c4 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 10:18:11 +0100 Subject: [PATCH 54/78] Fix isort --- src/bluesearch/entrypoint/database/run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py 
b/src/bluesearch/entrypoint/database/run.py index 3562fe76e..5c87f23e0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -25,17 +25,17 @@ from datetime import datetime from pathlib import Path +import luigi import pandas as pd import sqlalchemy from defusedxml import ElementTree from defusedxml.ElementTree import tostring - -import luigi -from bluesearch.database.article import ArticleSource from luigi.contrib.external_program import ExternalProgramTask from luigi.tools.deps_tree import print_tree from luigi.util import inherits, requires +from bluesearch.database.article import ArticleSource + logger = logging.getLogger(__name__) From 08f7c7796a75e0b082fe3f5478dbfd0f7fd1add7 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 11:18:31 +0100 Subject: [PATCH 55/78] Try to fix sphinx warning --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index f8649cf5c..d37550bcc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,6 +31,7 @@ version = bluesearch.__version__ # -- General configuration --------------------------------------------------- +suppress_warnings = ["ref.ref"] # because of luigi.util.requires # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom From 45fadf2aa862d49aef096867eb994b4127a92d2c Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 11:19:19 +0100 Subject: [PATCH 56/78] Remove custom_timeout from the source code However, still very useful locally --- src/bluesearch/entrypoint/database/run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 5c87f23e0..9e6efebee 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -172,7 +172,6 @@ def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = self.output().path return [ - "custom_timeout", *BBS_BINARY, "download", *VERBOSITY, From 09b338e9c0cff119f86c8280caa3324f7241d6ea Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 12:27:02 +0100 Subject: [PATCH 57/78] Add type annotations everywhere --- src/bluesearch/entrypoint/database/run.py | 28 +++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 9e6efebee..88857bfca 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -249,7 +249,7 @@ def output(self) -> luigi.LocalTarget: return luigi.LocalTarget(str(output_file)) - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" input_dir = self.input().path output_dir = self.output().path @@ -284,13 +284,13 @@ class TopicFilterTask(ExternalProgramTask): filter_config = luigi.Parameter() - def output(self): + def output(self) -> luigi.LocalTarget: """Define output file.""" output_file = Path(self.input().path).parent / "filtering.csv" return luigi.LocalTarget(str(output_file)) - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" 
extracted_topics = self.input().path output_file = self.output().path @@ -315,13 +315,13 @@ class PerformFilteringTask(luigi.Task): stage. The only input is the `filtering.csv`. """ - def output(self): + def output(self) -> luigi.LocalTarget: """Define output folder.""" output_dir = Path(self.input().path).parent / "filtered" return luigi.LocalTarget(str(output_dir)) - def run(self): + def run(self) -> None: """Create symlinks.""" output_dir = Path(self.output().path) @@ -381,7 +381,7 @@ class ConvertPDFTask(ExternalProgramTask): grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" input_dir = Path(self.input().path).parent / "filtered" output_dir = self.output().path @@ -398,7 +398,7 @@ def program_args(self): return command - def output(self): + def output(self) -> luigi.LocalTarget: """Define output folder.""" output_file = Path(self.input().path).parent / "converted_pdfs" @@ -413,20 +413,20 @@ class ParseTask(ExternalProgramTask): `source="arxiv"` `converted_pdfs/`. 
""" - def requires(self): + def requires(self) -> luigi.Task: """Define conditional dependencies.""" if self.source == "arxiv": return self.clone(ConvertPDFTask) else: return self.clone(PerformFilteringTask) - def output(self): + def output(self) -> luigi.LocalTarget: """Define output folder.""" output_file = Path(self.input().path).parent / "parsed" return luigi.LocalTarget(str(output_file)) - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = Path(self.output().path) output_dir.mkdir(exist_ok=True) @@ -451,8 +451,8 @@ def program_args(self): "parse", *VERBOSITY, parser, - input_dir, - output_dir, + str(input_dir), + str(output_dir), ] return command @@ -469,7 +469,7 @@ class AddTask(ExternalProgramTask): db_url = luigi.Parameter() db_type = luigi.Parameter() - def complete(self): + def complete(self) -> bool: """Check if all articles inside of `parsed/` are in the database.""" # If all the articles are inside if self.db_type == "sqlite": @@ -501,7 +501,7 @@ def complete(self): return not new_uids - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" input_dir = Path(self.input().path) From eb8bce61e7fae510dbab8550ee94f30eb6d42801 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 13:59:06 +0100 Subject: [PATCH 58/78] Add custom identifier logic --- src/bluesearch/entrypoint/database/run.py | 16 ++++++++++++++-- tests/unit/entrypoint/database/test_run.py | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 88857bfca..1451ae077 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -137,6 +137,11 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: type=int, help="The port of the GROBID server.", ) + parser.add_argument( + "--identifier", + type=str, + help="Custom 
name of the identifier. If not specified, we use `from-month_today`", + ) return parser @@ -156,15 +161,20 @@ class DownloadTask(ExternalProgramTask): source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() + identifier = luigi.OptionalParameter() def output(self) -> luigi.LocalTarget: """Define download folder.""" global OUTPUT_DIR_RAW if OUTPUT_DIR_RAW is None: today = datetime.today() - date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + if self.identifier is None: + identifier = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + else: + identifier = self.identifier + - OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / date / "raw" + OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / identifier / "raw" return luigi.LocalTarget(str(OUTPUT_DIR_RAW)) @@ -529,6 +539,7 @@ def run( dry_run: bool, grobid_host: str | None, grobid_port: int | None, + identifier: str | None, ) -> int: """Run overall pipeline. @@ -556,6 +567,7 @@ def run( grobid_port=grobid_port, db_url=db_url, db_type=db_type, + identifier=identifier, ) luigi_kwargs = { diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index acd64bf5e..bb49fb289 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -34,6 +34,7 @@ "dry_run", "grobid_host", "grobid_port", + "identifier", } @@ -135,6 +136,7 @@ def test_pipelines(source, tasks, tmp_path, capsys): grobid_port=1234, db_url="whatever", db_type="sqlite", + identifier=None, ) captured = capsys.readouterr() From 2e9e576d29f9ae2057153a9ec1eb72360b57aec6 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 13:59:49 +0100 Subject: [PATCH 59/78] Reformat --- src/bluesearch/entrypoint/database/run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 1451ae077..53e9211df 100644 --- 
a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -173,7 +173,6 @@ def output(self) -> luigi.LocalTarget: else: identifier = self.identifier - OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / identifier / "raw" return luigi.LocalTarget(str(OUTPUT_DIR_RAW)) From 3c17bb3773d1aca4931a4de80fe66c1ce3d748f0 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 14:07:54 +0100 Subject: [PATCH 60/78] Break the line --- src/bluesearch/entrypoint/database/run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 53e9211df..c6352e179 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -140,7 +140,9 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.add_argument( "--identifier", type=str, - help="Custom name of the identifier. If not specified, we use `from-month_today`", + help="""Custom name of the identifier. If not specified, we use + `from-month_today` + """, ) return parser From 7c50ca45e266072da5861afc31cb29000d9aa19c Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 14:10:40 +0100 Subject: [PATCH 61/78] Fix typos --- src/bluesearch/entrypoint/database/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index c6352e179..a2866e601 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -238,7 +238,7 @@ def run(self) -> None: class TopicExtractTask(ExternalProgramTask): """Topic extraction. - The input of this dask is either `raw/` or `raw_unzipped/` depending + The input of this task is either `raw/` or `raw_unzipped/` depending on the source. The output is going to be a single file `topic_infos.jsonl`. 
""" @@ -344,7 +344,7 @@ def run(self) -> None: # Find all input files (.xml.gz) all_input_files = [Path(p) for p in filtering["path"].unique()] - # Iteratively Load each of the files in memory + # Iteratively load each of the files in memory for input_file in all_input_files: # Unzip it with gzip.open(input_file) as xml_stream: From 927fc89c844744038517f8b0928c5ec16c7f8149 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 14:14:02 +0100 Subject: [PATCH 62/78] Add forgotten bracket --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index a2866e601..89ce85082 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -421,7 +421,7 @@ class ParseTask(ExternalProgramTask): """Parse articles. The input is all the articles inside of `filtered/` (or in case of - `source="arxiv"` `converted_pdfs/`. + `source="arxiv"` `converted_pdfs/`). 
""" def requires(self) -> luigi.Task: From 25e5fed4d9c59377d82ba8f58c5fa4ddf2e3b797 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 22 Feb 2022 13:51:58 +0100 Subject: [PATCH 63/78] Add recursive enumeration to pubmed --- src/bluesearch/entrypoint/database/run.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 89ce85082..020b7b239 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -282,6 +282,12 @@ def program_args(self) -> list[str]: if self.source in {"pmc", "pubmed"}: command.append(f"--mesh-topic-db={self.mesh_topic_db}") + if self.source == "pubmed": + command.extend( + ["-R", "-m", r".*\.xml\.gz$"], + ) + + return command From 0d5f772d1794f3b8b8aa3d214fd9f7f7b1ee3933 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 22 Feb 2022 13:57:30 +0100 Subject: [PATCH 64/78] Add logging for each element in file --- src/bluesearch/entrypoint/database/topic_extract.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 7a43e21d6..3a116939c 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -192,6 +192,7 @@ def run( articles = ElementTree.parse(xml_stream) for i, article in enumerate(articles.iter("PubmedArticle")): + logger.info(f"Processing element in file {i}") topic_info = TopicInfo( source=article_source, path=path.resolve(), From cdc0d873145bc3da9297cac6ea6dd07d7fa61739 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 21 Feb 2022 14:14:28 +0100 Subject: [PATCH 65/78] Skip download for arXiv articles with broken ID or version (#586) --- src/bluesearch/database/download.py | 22 ++++++++++++---------- tests/unit/database/test_download.py | 24 +++++++++++++++++------- 2 files changed, 29 insertions(+), 17 deletions(-) 
diff --git a/src/bluesearch/database/download.py b/src/bluesearch/database/download.py index 06493351c..b3be9d07e 100644 --- a/src/bluesearch/database/download.py +++ b/src/bluesearch/database/download.py @@ -246,21 +246,23 @@ def get_gcs_urls( client = bucket.client + def _extract_blob_info(blob: Blob) -> tuple[Blob, str, str, int] | None: + try: + name = blob.name + full_name = blob.name.rsplit("v", 1)[0] + article = int(blob.name.rsplit("v", 1)[1].split(".")[0]) + except ValueError: + return None + return blob, name, full_name, article + url_dict = {} for yearmonth in yearmonth_list: - iterator = client.list_blobs(bucket, prefix=f"arxiv/arxiv/pdf/{yearmonth}") + all_blobs = client.list_blobs(bucket, prefix=f"arxiv/arxiv/pdf/{yearmonth}") # If more than one version is found, we only keep the last one + blobs_info = (_extract_blob_info(blob) for blob in all_blobs) df = pd.DataFrame( - ( - ( - el, - el.name, - el.name.rsplit("v", 1)[0], - int(el.name.rsplit("v", 1)[1].split(".")[0]), - ) - for el in iterator - ), + (info for info in blobs_info if info is not None), columns=["blob", "fullname", "article", "version"], ) diff --git a/tests/unit/database/test_download.py b/tests/unit/database/test_download.py index 34928e915..3981dc758 100644 --- a/tests/unit/database/test_download.py +++ b/tests/unit/database/test_download.py @@ -183,32 +183,42 @@ def test_get_gcs_urls(): fake_client = Mock() fake_bucket = Bucket(fake_client, "my_dir/file.txt") fake_blobs_by_prefix = { + "arxiv/arxiv/pdf/2109": [ + Blob("topic-a/99.6767v1.1.pdf", fake_bucket), # invalid version + Blob("topic-v/99.6767v1.2.pdf", fake_bucket), # invalid version + Blob("topic-v/99.6767v1a.pdf", fake_bucket), # invalid version + Blob("topic-v/99.6767v10.pdf", fake_bucket), + Blob("topic-v/99.6767v3.pdf", fake_bucket), # older version + ], "arxiv/arxiv/pdf/2110": [ - Blob("topic-a/12.3450v1.pdf", fake_bucket), + Blob("topic-a/12.3450v1.pdf", fake_bucket), # older version Blob("topic-v/12.3450v2.pdf", 
fake_bucket), ], "arxiv/arxiv/pdf/2111": [ - Blob("topic-v/99.3450v2.pdf", fake_bucket), - Blob("topic-v/99.3450v3.pdf", fake_bucket), + Blob("topic-v/99.3450v2.pdf", fake_bucket), # older version + Blob("topic-v/99.3450v3.pdf", fake_bucket), # older version Blob("topic-v/99.3450v10.pdf", fake_bucket), ], "arxiv/arxiv/pdf/2112": [ Blob("topic-v/33.1v2.pdf", fake_bucket), Blob("topic-v/44.1v2.pdf", fake_bucket), Blob("topic-v/55.1v2.pdf", fake_bucket), - Blob("topic-v/55.1v1.pdf", fake_bucket), + Blob("topic-v/55.1v1.pdf", fake_bucket), # older version ], } fake_client.list_blobs.side_effect = lambda bucket, prefix: fake_blobs_by_prefix[ prefix ] - start_date = datetime(2021, 10, 1) + start_date = datetime(2021, 9, 1) end_date = datetime(2021, 12, 1) blobs_by_month = get_gcs_urls(fake_bucket, start_date, end_date) - assert fake_client.list_blobs.call_count == 3 - assert set(blobs_by_month) == {"2110", "2111", "2112"} + assert fake_client.list_blobs.call_count == 4 + assert set(blobs_by_month) == {"2109", "2110", "2111", "2112"} + assert set(blobs_by_month["2109"]) == set( + fake_blobs_by_prefix["arxiv/arxiv/pdf/2109"][-2:-1] + ) assert set(blobs_by_month["2110"]) == set( fake_blobs_by_prefix["arxiv/arxiv/pdf/2110"] ) From c9dcb395bac93fb4b78b258278c131bfb149ff27 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 23 Feb 2022 21:58:38 +0100 Subject: [PATCH 66/78] Add separate try except blocks for each source --- .../entrypoint/database/topic_extract.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 3a116939c..dc8dafb76 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -173,7 +173,11 @@ def run( for path in inputs: logger.info(f"Processing {path}") topic_info = TopicInfo(source=article_source, path=path.resolve()) - journal_topics = 
get_topics_for_pmc_article(path) + try: + journal_topics = get_topics_for_pmc_article(path) + except Exception: + logger.error(f"Failed to extract topic from {path}") + if journal_topics: topic_info.add_journal_topics( "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) @@ -198,8 +202,12 @@ def run( path=path.resolve(), element_in_file=i, ) - article_topics = extract_article_topics_for_pubmed_article(article) - journal_topics = extract_journal_topics_for_pubmed_article(article) + try: + article_topics = extract_article_topics_for_pubmed_article(article) + journal_topics = extract_journal_topics_for_pubmed_article(article) + except Exception: + logger.error(f"Failed to extract topic from {i}") + if article_topics: topic_info.add_article_topics( "MeSH", mesh.resolve_parents(article_topics, mesh_tree) @@ -212,7 +220,11 @@ def run( elif article_source is ArticleSource.ARXIV: for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): topic_info = TopicInfo(source=article_source, path=path) - topic_info.add_article_topics("arXiv", article_topics) + try: + topic_info.add_article_topics("arXiv", article_topics) + except Exception: + logger.error(f"Failed to extract topic from {path}") + all_results.append(topic_info.json()) elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: for path in inputs: @@ -220,7 +232,11 @@ def run( topic, journal = extract_article_topics_from_medrxiv_article(path) journal = journal.lower() topic_info = TopicInfo(source=ArticleSource(journal), path=path) - topic_info.add_article_topics("Subject Area", [topic]) + try: + topic_info.add_article_topics("Subject Area", [topic]) + except Exception: + logger.error(f"Failed to extract topic from {path}") + all_results.append(topic_info.json()) else: logger.error(f"The source type {source!r} is not implemented yet") From 587e239f651145b16c522f39904a7ae6dd2a4b6a Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 23 Feb 2022 22:12:51 +0100 Subject: [PATCH 67/78] Fix bug 
--- src/bluesearch/entrypoint/database/topic_extract.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index dc8dafb76..b045d364d 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -220,22 +220,19 @@ def run( elif article_source is ArticleSource.ARXIV: for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): topic_info = TopicInfo(source=article_source, path=path) - try: - topic_info.add_article_topics("arXiv", article_topics) - except Exception: - logger.error(f"Failed to extract topic from {path}") + topic_info.add_article_topics("arXiv", article_topics) all_results.append(topic_info.json()) elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: for path in inputs: logger.info(f"Processing {path}") - topic, journal = extract_article_topics_from_medrxiv_article(path) - journal = journal.lower() - topic_info = TopicInfo(source=ArticleSource(journal), path=path) try: - topic_info.add_article_topics("Subject Area", [topic]) + topic, journal = extract_article_topics_from_medrxiv_article(path) except Exception: logger.error(f"Failed to extract topic from {path}") + journal = journal.lower() + topic_info = TopicInfo(source=ArticleSource(journal), path=path) + topic_info.add_article_topics("Subject Area", [topic]) all_results.append(topic_info.json()) else: From 03e8af18d3238fa16ddab192ca60b39edfc336e8 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 23 Feb 2022 22:25:41 +0100 Subject: [PATCH 68/78] Format nicely --- src/bluesearch/entrypoint/database/run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 020b7b239..f60bdb098 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ 
-287,7 +287,6 @@ def program_args(self) -> list[str]: ["-R", "-m", r".*\.xml\.gz$"], ) - return command From 95847c5175d6e50e9d581d20bde6e140e19efd35 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 16:59:12 +0100 Subject: [PATCH 69/78] Add the possibility of early stopping --- src/bluesearch/entrypoint/database/run.py | 48 +++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index f60bdb098..75f070495 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -144,6 +144,13 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: `from-month_today` """, ) + parser.add_argument( + "--final-task", + type=str, + help="""Name of the task where to manually stop the pipeline. Note + that the task itself will be included. + """, + ) return parser @@ -533,6 +540,30 @@ def program_args(self) -> list[str]: return command +def get_all_dependencies(task: luigi.Task) -> set[luigi.Task]: + """Get all dependencies of a given task. + + Parameters + ---------- + task + Input task + + Returns + ------- + set[luigi.Task] + All the tasks that the `input` depends on. + """ + current_deps = set(task.deps()) + if not current_deps: + return set() + + else: + deps = set() + for current_dep in current_deps: + deps |= get_all_dependencies(current_dep) + + return deps | current_deps + def run( *, source: str, @@ -546,6 +577,7 @@ def run( grobid_host: str | None, grobid_port: int | None, identifier: str | None, + final_task: str | None, ) -> int: """Run overall pipeline. 
@@ -563,7 +595,7 @@ def run( ParseTask.capture_output = CAPTURE_OUTPUT AddTask.capture_output = CAPTURE_OUTPUT - final_task = AddTask( + add_task_inst = AddTask( source=source, from_month=from_month, filter_config=str(filter_config), @@ -575,14 +607,24 @@ def run( db_type=db_type, identifier=identifier, ) + if final_task is None: + selected_task_inst = add_task_inst + else: + all_dependencies = get_all_dependencies(add_task_inst) + all_dependencies_map = {t.__class__.__name__: t for t in all_dependencies} + + if final_task in all_dependencies_map: + selected_task_inst = all_dependencies_map[final_task] + else: + raise ValueError(f"Unrecognized final task {final_task}") luigi_kwargs = { - "tasks": [final_task], + "tasks": [selected_task_inst], "log_level": "WARNING", "local_scheduler": True, } if dry_run: - print(print_tree(final_task, last=False)) + print(print_tree(selected_task_inst, last=False)) else: luigi.build(**luigi_kwargs) From 5e255d429b6ebc8a2c45b3c6e73a2d3c6d1b44ff Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 18:38:51 +0100 Subject: [PATCH 70/78] Small modification --- src/bluesearch/entrypoint/database/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 75f070495..9a1ff4b9a 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -551,14 +551,14 @@ def get_all_dependencies(task: luigi.Task) -> set[luigi.Task]: Returns ------- set[luigi.Task] - All the tasks that the `input` depends on. + All the tasks that the `input` depends on including itself. 
""" current_deps = set(task.deps()) if not current_deps: return set() else: - deps = set() + deps = {task} for current_dep in current_deps: deps |= get_all_dependencies(current_dep) From 93569a30ebadc649da8214edb9c8b0fff7929893 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 19:05:40 +0100 Subject: [PATCH 71/78] Add iffy tests --- src/bluesearch/entrypoint/database/run.py | 21 ++--- tests/unit/entrypoint/database/test_run.py | 94 ++++++++++++++++++++++ 2 files changed, 106 insertions(+), 9 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 9a1ff4b9a..d01cf3438 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -158,7 +158,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: BBS_BINARY = ["bbs_database"] VERBOSITY = ["-v"] # for the entrypoint subprocesses CAPTURE_OUTPUT = False -OUTPUT_DIR_RAW = None # make sure the same datestamp for all tasks +IDENTIFIER = None # make sure the same for all tasks class DownloadTask(ExternalProgramTask): @@ -174,17 +174,21 @@ class DownloadTask(ExternalProgramTask): def output(self) -> luigi.LocalTarget: """Define download folder.""" - global OUTPUT_DIR_RAW - if OUTPUT_DIR_RAW is None: - today = datetime.today() - if self.identifier is None: + global IDENTIFIER + if self.identifier is not None: + identifier = self.identifier + + else: + if IDENTIFIER is None: + today = datetime.today() identifier = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + IDENTIFIER = identifier else: - identifier = self.identifier + identifier = IDENTIFIER - OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / identifier / "raw" + output_dir = Path(self.output_dir) / self.source / identifier / "raw" - return luigi.LocalTarget(str(OUTPUT_DIR_RAW)) + return luigi.LocalTarget(str(output_dir)) def program_args(self) -> list[str]: """Define subprocess arguments.""" @@ -452,7 +456,6 @@ def 
output(self) -> luigi.LocalTarget: def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = Path(self.output().path) - output_dir.mkdir(exist_ok=True) if (output_dir.parent / "converted_pdfs").exists(): input_dir = output_dir.parent / "converted_pdfs" diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index bb49fb289..b40705b92 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -18,6 +18,8 @@ import argparse import inspect import pathlib +from subprocess import Popen +from unittest.mock import Mock import pytest @@ -35,6 +37,7 @@ "grobid_host", "grobid_port", "identifier", + "final_task", } @@ -137,6 +140,7 @@ def test_pipelines(source, tasks, tmp_path, capsys): db_url="whatever", db_type="sqlite", identifier=None, + final_task=None, ) captured = capsys.readouterr() @@ -145,3 +149,93 @@ def test_pipelines(source, tasks, tmp_path, capsys): for stdout_line, task in zip(stdout_lines, tasks): assert task in stdout_line assert "PENDING" in stdout_line + + +@pytest.mark.parametrize( + "source", + [ + "arxiv", + "biorxiv", + "medrxiv", + "pmc", + "pubmed", + ] +) +def test_all( + tmp_path, + monkeypatch, + source, +): + identifier = "ABC" + root_dir = tmp_path / source / identifier + + fake_Popen_inst = Mock(spec=Popen) + fake_Popen_inst.returncode = 0 + + def create_output(args, **kwargs): + entrypoint = args[1] + + if entrypoint == "download": + output_path = root_dir / "raw/" + output_path.mkdir(parents=True) + + elif entrypoint == "topic-extract": + output_path = root_dir / "topic_infos.jsonl" + output_path.touch() + + elif entrypoint == "topic-filter": + output_path = root_dir / "filtering.csv" + output_path.touch() + + elif entrypoint == "convert-pdf": + output_path = root_dir / "converted_pdfs/" + output_path.mkdir() + + elif entrypoint == "parse": + output_path = root_dir / "parsed/" + output_path.mkdir() + + elif entrypoint == 
"add": + pass + + return fake_Popen_inst + + + fake_Popen_class = Mock(side_effect=create_output) + monkeypatch.setattr("subprocess.Popen", fake_Popen_class) + monkeypatch.setattr(run.UnzipTask, "run", lambda _: (root_dir / "raw_unzipped").mkdir()) + monkeypatch.setattr(run.PerformFilteringTask, "run", lambda _: (root_dir / "filtered/").mkdir()) + monkeypatch.setattr(run.AddTask, "complete", lambda _: False) + + run.run( + source=source, + from_month="1234-11", + filter_config=pathlib.Path("aa"), + output_dir=tmp_path, + dry_run=False, + mesh_topic_db=pathlib.Path("whatever"), + grobid_host="112431321", + grobid_port=8000, + db_url="whatever", + db_type="sqlite", + identifier=identifier, + final_task="AddTask", + ) + assert (root_dir / "raw").exists() + if source == "pmc": + assert (root_dir / "raw_unzipped").exists() + + assert (root_dir / "topic_infos.jsonl").exists() + assert (root_dir / "filtering.csv").exists() + assert (root_dir / "filtered").exists() + + if source == "arxiv": + assert (root_dir / "converted_pdfs").exists() + + assert (root_dir / "parsed").exists() + + if source == "arxiv": + assert fake_Popen_class.call_count == 6 + else: + assert fake_Popen_class.call_count == 5 + From 8620c8707d693e5874650e0faa1eb098f79f7d83 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 19:06:30 +0100 Subject: [PATCH 72/78] Run formatter --- src/bluesearch/entrypoint/database/run.py | 1 + tests/unit/entrypoint/database/test_run.py | 16 +++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index d01cf3438..22ceff5d2 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -567,6 +567,7 @@ def get_all_dependencies(task: luigi.Task) -> set[luigi.Task]: return deps | current_deps + def run( *, source: str, diff --git a/tests/unit/entrypoint/database/test_run.py 
b/tests/unit/entrypoint/database/test_run.py index b40705b92..2e74c2a09 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -152,14 +152,14 @@ def test_pipelines(source, tasks, tmp_path, capsys): @pytest.mark.parametrize( - "source", + "source", [ "arxiv", "biorxiv", "medrxiv", "pmc", "pubmed", - ] + ], ) def test_all( tmp_path, @@ -171,7 +171,7 @@ def test_all( fake_Popen_inst = Mock(spec=Popen) fake_Popen_inst.returncode = 0 - + def create_output(args, **kwargs): entrypoint = args[1] @@ -200,11 +200,14 @@ def create_output(args, **kwargs): return fake_Popen_inst - fake_Popen_class = Mock(side_effect=create_output) monkeypatch.setattr("subprocess.Popen", fake_Popen_class) - monkeypatch.setattr(run.UnzipTask, "run", lambda _: (root_dir / "raw_unzipped").mkdir()) - monkeypatch.setattr(run.PerformFilteringTask, "run", lambda _: (root_dir / "filtered/").mkdir()) + monkeypatch.setattr( + run.UnzipTask, "run", lambda _: (root_dir / "raw_unzipped").mkdir() + ) + monkeypatch.setattr( + run.PerformFilteringTask, "run", lambda _: (root_dir / "filtered/").mkdir() + ) monkeypatch.setattr(run.AddTask, "complete", lambda _: False) run.run( @@ -238,4 +241,3 @@ def create_output(args, **kwargs): assert fake_Popen_class.call_count == 6 else: assert fake_Popen_class.call_count == 5 - From efa6821b859c6418839f4123c1fb856e84381b01 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 19:10:14 +0100 Subject: [PATCH 73/78] Ignore a luigi warning --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 791f75f4a..382503f7e 100644 --- a/tox.ini +++ b/tox.ini @@ -153,6 +153,7 @@ testpaths = tests filterwarnings = error ignore::DeprecationWarning:docker.*: + ignore::DeprecationWarning:luigi.task: addopts = --cov --cov-config=tox.ini From 7c815957f5244fe9cf175bc8297b2b382ac6eb7e Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Mar 2022 11:07:15 +0100 Subject: [PATCH 74/78] Use context manager 
--- src/bluesearch/entrypoint/database/run.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 22ceff5d2..4660e0ba1 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -232,14 +232,13 @@ def run(self) -> None: all_tar_files = input_dir.rglob("*.tar.gz") for archive in all_tar_files: output_path = output_dir / archive.stem - my_tar = tarfile.open(archive) - all_articles = [x for x in my_tar.getmembers() if x.isfile()] - for article in all_articles: - output_path = output_dir / article.path.rpartition("/")[2] - f_in = my_tar.extractfile(article) - with open(output_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) # type: ignore - my_tar.close() + with tarfile.open(archive) as my_tar: + all_articles = [x for x in my_tar.getmembers() if x.isfile()] + for article in all_articles: + output_path = output_dir / article.path.rpartition("/")[2] + f_in = my_tar.extractfile(article) + with open(output_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) # type: ignore else: raise ValueError(f"Unsupported source {self.source}") From 95827a23e414cf11b9e03a06bc327e6acc231150 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 9 Mar 2022 14:07:52 +0100 Subject: [PATCH 75/78] Move luigi parameters to a config file --- luigi.cfg | 2 ++ src/bluesearch/entrypoint/database/run.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/luigi.cfg b/luigi.cfg index b2c955f2b..8830e6998 100644 --- a/luigi.cfg +++ b/luigi.cfg @@ -1,2 +1,4 @@ [core] autoload_range=true + log_level = INFO + local_scheduler = True diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 4660e0ba1..824ea85ad 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -623,8 +623,6 @@ def run( luigi_kwargs = { "tasks": 
[selected_task_inst], - "log_level": "WARNING", - "local_scheduler": True, } if dry_run: print(print_tree(selected_task_inst, last=False)) From 0ff13e4850165d83fa18cef0b78f667fa8016807 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 15 Mar 2022 13:10:42 +0100 Subject: [PATCH 76/78] Remove requires/inherits decorator --- luigi.cfg | 23 ++ src/bluesearch/entrypoint/database/run.py | 265 +++++++-------------- tests/unit/entrypoint/database/test_run.py | 61 ++--- 3 files changed, 127 insertions(+), 222 deletions(-) diff --git a/luigi.cfg b/luigi.cfg index 8830e6998..5f9baf652 100644 --- a/luigi.cfg +++ b/luigi.cfg @@ -2,3 +2,26 @@ autoload_range=true log_level = INFO local_scheduler = True + +[GlobalParams] + source=pubmed + +[DownloadTask] + from_month=2021-12 + output_dir=luigi-pipeline + identifier= + ; emtpy string is considered default value + +[TopicExtractTask] + mesh_topic_db=luigi-pipeline/mesh_topic_db.json + +[TopicFilterTask] + filter_config=luigi-pipeline/filter-config.jsonl + +[ConvertPDFTask] + grobid_host=0.0.0.0 + grobid_port=8070 + +[AddTask] + db_url=luigi-pipeline/my-db.db + db_type=sqlite \ No newline at end of file diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 824ea85ad..5bfb3361a 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -20,6 +20,8 @@ import argparse import gzip import logging +import pathlib +import re import shutil import tarfile from datetime import datetime @@ -32,9 +34,6 @@ from defusedxml.ElementTree import tostring from luigi.contrib.external_program import ExternalProgramTask from luigi.tools.deps_tree import print_tree -from luigi.util import inherits, requires - -from bluesearch.database.article import ArticleSource logger = logging.getLogger(__name__) @@ -56,70 +55,29 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.description = "Run the overall pipeline." 
parser.add_argument( - "--source", - required=True, - type=str, - choices=[member.value for member in ArticleSource], - help="Source of the articles.", - ) - parser.add_argument( - "--from-month", - required=True, + "--final-task", type=str, - help="The starting month (included) for the download in format YYYY-MM. " - "All papers from the given month until today will be downloaded.", - ) - parser.add_argument( - "--filter-config", - required=True, - type=Path, - help=""" - Path to a .JSONL file that defines all the rules for filtering. - """, + choices=( + "DownloadTask", + "UnzipTask", + "TopicExtractTask", + "TopicFilterTask", + "PerformFilteringTask", + "ConvertPDFTask", + "ParseTask", + "AddTask", + ), + help="Final task of the luigi pipeline.", ) parser.add_argument( - "--output-dir", - required=True, + "--config-path", type=Path, - help=""" - Path to the output folder. All the results stored under - `output_dir/source/date` where date is concatenation of the - `from_month` and the day of execution of this command. - """, - ) - parser.add_argument( - "--db-url", - required=True, - type=str, - help=""" - The location of the database depending on the database type. - - For MySQL and MariaDB the server URL should be provided, for SQLite the - location of the database file. Generally, the scheme part of - the URL should be omitted, e.g. for MySQL the URL should be - of the form 'my_sql_server.ch:1234/my_database' and for SQLite - of the form '/path/to/the/local/database.db'. - """, + help="Configuration Path.", ) parser.add_argument( - "--db-type", - default="sqlite", + "--luigi-config", type=str, - choices=("mariadb", "mysql", "postgres", "sqlite"), - help="Type of the database.", - ) - parser.add_argument( - "--mesh-topic-db", - type=Path, - help=""" - The JSON file with MeSH topic hierarchy information. Mandatory for - source types "pmc" and "pubmed". 
- - The JSON file should contain a flat dictionary with MeSH topic tree - numbers mapped to the corresponding topic labels. This file can be - produced using the `bbs_database parse-mesh-rdf` command. See that - command's description for more details. - """, + help="Configuration parameters.", ) parser.add_argument( "--dry-run", @@ -127,30 +85,6 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: action="store_true", help="Prints out a diagram of the pipeline without running it.", ) - parser.add_argument( - "--grobid-host", - type=str, - help="The host of the GROBID server.", - ) - parser.add_argument( - "--grobid-port", - type=int, - help="The port of the GROBID server.", - ) - parser.add_argument( - "--identifier", - type=str, - help="""Custom name of the identifier. If not specified, we use - `from-month_today` - """, - ) - parser.add_argument( - "--final-task", - type=str, - help="""Name of the task where to manually stop the pipeline. Note - that the task itself will be included. - """, - ) return parser @@ -161,13 +95,18 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: IDENTIFIER = None # make sure the same for all tasks +class GlobalParams(luigi.Config): + """Global configuration.""" + + source = luigi.Parameter() + + class DownloadTask(ExternalProgramTask): """Download raw files. They will be stored in the `raw/` folder. 
""" - source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() identifier = luigi.OptionalParameter() @@ -186,7 +125,7 @@ def output(self) -> luigi.LocalTarget: else: identifier = IDENTIFIER - output_dir = Path(self.output_dir) / self.source / identifier / "raw" + output_dir = Path(self.output_dir) / GlobalParams().source / identifier / "raw" return luigi.LocalTarget(str(output_dir)) @@ -197,13 +136,12 @@ def program_args(self) -> list[str]: *BBS_BINARY, "download", *VERBOSITY, - self.source, + GlobalParams().source, self.from_month, output_dir, ] -@requires(DownloadTask) class UnzipTask(ExternalProgramTask): """Unzip raw files (if necessary). @@ -211,7 +149,10 @@ class UnzipTask(ExternalProgramTask): are stored inside of `raw_unzipped`. """ - source = luigi.Parameter() + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return DownloadTask() def output(self) -> luigi.LocalTarget: """Define unzipping folder.""" @@ -226,7 +167,7 @@ def run(self) -> None: output_dir = Path(self.output().path) # raw_unzipped output_dir.mkdir(exist_ok=True, parents=True) - if self.source == "pmc": + if GlobalParams().source == "pmc": # .tar.gz # We want collapse the folder hierarchy all_tar_files = input_dir.rglob("*.tar.gz") @@ -241,10 +182,9 @@ def run(self) -> None: shutil.copyfileobj(f_in, f_out) # type: ignore else: - raise ValueError(f"Unsupported source {self.source}") + raise ValueError(f"Unsupported source {GlobalParams().source}") -@inherits(DownloadTask, UnzipTask) class TopicExtractTask(ExternalProgramTask): """Topic extraction. @@ -253,15 +193,15 @@ class TopicExtractTask(ExternalProgramTask): `topic_infos.jsonl`. 
""" - source = luigi.Parameter() mesh_topic_db = luigi.Parameter() - def requires(self) -> luigi.Task: + @staticmethod + def requires() -> luigi.Task: """Define conditional dependencies.""" - if self.source in {"pmc"}: - return self.clone(UnzipTask) + if GlobalParams().source in {"pmc"}: + return UnzipTask() else: - return self.clone(DownloadTask) + return DownloadTask() def output(self) -> luigi.LocalTarget: """Define output file path.""" @@ -279,20 +219,20 @@ def program_args(self) -> list[str]: *BBS_BINARY, "topic-extract", *VERBOSITY, - self.source, + GlobalParams().source, input_dir, output_dir, ] - if self.source in {"medrxiv", "biorxiv"}: + if GlobalParams().source in {"medrxiv", "biorxiv"}: command.extend( ["-R", "-m", r".*\.meca$"], ) - if self.source in {"pmc", "pubmed"}: + if GlobalParams().source in {"pmc", "pubmed"}: command.append(f"--mesh-topic-db={self.mesh_topic_db}") - if self.source == "pubmed": + if GlobalParams().source == "pubmed": command.extend( ["-R", "-m", r".*\.xml\.gz$"], ) @@ -300,7 +240,6 @@ def program_args(self) -> list[str]: return command -@requires(TopicExtractTask) class TopicFilterTask(ExternalProgramTask): """Run topic filtering entrypoint. @@ -310,6 +249,11 @@ class TopicFilterTask(ExternalProgramTask): filter_config = luigi.Parameter() + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return TopicExtractTask() + def output(self) -> luigi.LocalTarget: """Define output file.""" output_file = Path(self.input().path).parent / "filtering.csv" @@ -333,7 +277,6 @@ def program_args(self) -> list[str]: return command -@requires(TopicFilterTask) class PerformFilteringTask(luigi.Task): """Create folder that only contains relevant articles. @@ -341,6 +284,11 @@ class PerformFilteringTask(luigi.Task): stage. The only input is the `filtering.csv`. 
""" + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return TopicFilterTask() + def output(self) -> luigi.LocalTarget: """Define output folder.""" output_dir = Path(self.input().path).parent / "filtered" @@ -355,7 +303,7 @@ def run(self) -> None: output_dir.mkdir(exist_ok=True) - if self.source == "pubmed": + if GlobalParams().source == "pubmed": # Find all input files (.xml.gz) all_input_files = [Path(p) for p in filtering["path"].unique()] @@ -396,7 +344,6 @@ def create_symlink(path): accepted.apply(create_symlink) -@requires(PerformFilteringTask) class ConvertPDFTask(ExternalProgramTask): """Convert PDFs to XMLs. @@ -407,6 +354,11 @@ class ConvertPDFTask(ExternalProgramTask): grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return PerformFilteringTask() + def program_args(self) -> list[str]: """Define subprocess arguments.""" input_dir = Path(self.input().path).parent / "filtered" @@ -431,7 +383,6 @@ def output(self) -> luigi.LocalTarget: return luigi.LocalTarget(str(output_file)) -@inherits(ConvertPDFTask, PerformFilteringTask) class ParseTask(ExternalProgramTask): """Parse articles. @@ -439,12 +390,13 @@ class ParseTask(ExternalProgramTask): `source="arxiv"` `converted_pdfs/`). 
""" - def requires(self) -> luigi.Task: + @staticmethod + def requires() -> luigi.Task: """Define conditional dependencies.""" - if self.source == "arxiv": - return self.clone(ConvertPDFTask) + if GlobalParams().source == "arxiv": + return ConvertPDFTask() else: - return self.clone(PerformFilteringTask) + return PerformFilteringTask() def output(self) -> luigi.LocalTarget: """Define output folder.""" @@ -469,7 +421,7 @@ def program_args(self) -> list[str]: "pmc": "jats-xml", "pubmed": "pubmed-xml-set", } - parser = source2parser[self.source] + parser = source2parser[GlobalParams().source] command = [ *BBS_BINARY, @@ -483,7 +435,6 @@ def program_args(self) -> list[str]: return command -@requires(ParseTask) class AddTask(ExternalProgramTask): """Add parsed articles to the database. @@ -494,6 +445,11 @@ class AddTask(ExternalProgramTask): db_url = luigi.Parameter() db_type = luigi.Parameter() + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return ParseTask() + def complete(self) -> bool: """Check if all articles inside of `parsed/` are in the database.""" # If all the articles are inside @@ -542,45 +498,12 @@ def program_args(self) -> list[str]: return command -def get_all_dependencies(task: luigi.Task) -> set[luigi.Task]: - """Get all dependencies of a given task. - - Parameters - ---------- - task - Input task - - Returns - ------- - set[luigi.Task] - All the tasks that the `input` depends on including itself. 
- """ - current_deps = set(task.deps()) - if not current_deps: - return set() - - else: - deps = {task} - for current_dep in current_deps: - deps |= get_all_dependencies(current_dep) - - return deps | current_deps - - def run( *, - source: str, - from_month: str, - filter_config: Path, - output_dir: Path, - db_url: str, - db_type: str, - mesh_topic_db: Path | None, dry_run: bool, - grobid_host: str | None, - grobid_port: int | None, - identifier: str | None, - final_task: str | None, + final_task: str | None = None, + config_path: Path | None = None, + luigi_config: str | None = None, ) -> int: """Run overall pipeline. @@ -598,36 +521,28 @@ def run( ParseTask.capture_output = CAPTURE_OUTPUT AddTask.capture_output = CAPTURE_OUTPUT - add_task_inst = AddTask( - source=source, - from_month=from_month, - filter_config=str(filter_config), - output_dir=str(output_dir), - mesh_topic_db=str(mesh_topic_db), - grobid_host=grobid_host, - grobid_port=grobid_port, - db_url=db_url, - db_type=db_type, - identifier=identifier, - ) - if final_task is None: - selected_task_inst = add_task_inst - else: - all_dependencies = get_all_dependencies(add_task_inst) - all_dependencies_map = {t.__class__.__name__: t for t in all_dependencies} + if config_path: + if not pathlib.Path(config_path).exists(): + raise ValueError(f"The configuration path {config_path} does not exist!") - if final_task in all_dependencies_map: - selected_task_inst = all_dependencies_map[final_task] - else: - raise ValueError(f"Unrecognized final task {final_task}") + config = luigi.configuration.get_config() + config.add_config_path(config_path) + config.reload() - luigi_kwargs = { - "tasks": [selected_task_inst], - } - if dry_run: - print(print_tree(selected_task_inst, last=False)) + if luigi_config: + config = luigi.configuration.get_config() + for param in luigi_config.split(","): + change = re.split(r"[.:]", param, maxsplit=3) + config.set(*change) + + if final_task: + final_task_call = globals()[final_task] else: 
+ final_task_call = AddTask - luigi.build(**luigi_kwargs) + if dry_run: + print(print_tree(final_task_call(), last=False)) + else: + luigi.build([final_task_call()]) return 0 diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 2e74c2a09..0d31de22e 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -17,7 +17,6 @@ import argparse import inspect -import pathlib from subprocess import Popen from unittest.mock import Mock @@ -26,39 +25,24 @@ from bluesearch.entrypoint.database import run RUN_PARAMS = { - "source", - "from_month", - "filter_config", - "output_dir", - "db_url", - "db_type", - "mesh_topic_db", - "dry_run", - "grobid_host", - "grobid_port", - "identifier", "final_task", + "config_path", + "luigi_config", + "dry_run", } def test_init_parser(): parser = run.init_parser(argparse.ArgumentParser()) - args = parser.parse_args( - [ - "--source=arxiv", - "--from-month=2021-12", - "--filter-config=/path/to/config.jsonl", - "--output-dir=some/output/dir", - "--db-url=some.url", - ] - ) + args = parser.parse_args([]) assert vars(args).keys() == RUN_PARAMS - # Test the values - assert args.source == "arxiv" - assert args.from_month == "2021-12" - assert args.filter_config == pathlib.Path("/path/to/config.jsonl") + # # Test the values + assert args.final_task is None + assert args.luigi_config is None + assert args.dry_run is False + assert args.config_path is None def test_run_arguments(): @@ -129,18 +113,9 @@ def test_run_arguments(): ) def test_pipelines(source, tasks, tmp_path, capsys): run.run( - source=source, - from_month="whatever", - filter_config=pathlib.Path("whatever"), - output_dir=tmp_path, + luigi_config=f"GlobalParams.source:{source}," + f"DownloadTask.output_dir:{tmp_path}", dry_run=True, - mesh_topic_db=pathlib.Path("whatever"), - grobid_host="whatever", - grobid_port=1234, - db_url="whatever", - db_type="sqlite", - identifier=None, - 
final_task=None, ) captured = capsys.readouterr() @@ -211,18 +186,10 @@ def create_output(args, **kwargs): monkeypatch.setattr(run.AddTask, "complete", lambda _: False) run.run( - source=source, - from_month="1234-11", - filter_config=pathlib.Path("aa"), - output_dir=tmp_path, + luigi_config=f"GlobalParams.source:{source}," + f"DownloadTask.output_dir:{tmp_path}," + f"DownloadTask.identifier:{identifier}", dry_run=False, - mesh_topic_db=pathlib.Path("whatever"), - grobid_host="112431321", - grobid_port=8000, - db_url="whatever", - db_type="sqlite", - identifier=identifier, - final_task="AddTask", ) assert (root_dir / "raw").exists() if source == "pmc": From bd77681a87f42973760646f19330e053e4b1b712 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Fri, 18 Mar 2022 08:51:18 +0100 Subject: [PATCH 77/78] Fix linting and add header luigi.cfg --- luigi.cfg | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/luigi.cfg b/luigi.cfg index 5f9baf652..ee1858a82 100644 --- a/luigi.cfg +++ b/luigi.cfg @@ -1,27 +1,44 @@ +;Blue Brain Search is a text mining toolbox focused on scientific use cases. +; +;Copyright (C) 2020 Blue Brain Project, EPFL. +; +;This program is free software: you can redistribute it and/or modify +;it under the terms of the GNU Lesser General Public License as published by +;the Free Software Foundation, either version 3 of the License, or +;(at your option) any later version. +; +;This program is distributed in the hope that it will be useful, +;but WITHOUT ANY WARRANTY; without even the implied warranty of +;MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;GNU Lesser General Public License for more details. +; +;You should have received a copy of the GNU Lesser General Public License +;along with this program. If not, see . 
+ [core] - autoload_range=true - log_level = INFO - local_scheduler = True +autoload_range = true +log_level = INFO +local_scheduler = True [GlobalParams] - source=pubmed +source = pubmed [DownloadTask] - from_month=2021-12 - output_dir=luigi-pipeline - identifier= - ; emtpy string is considered default value +from_month = 2021-12 +output_dir = luigi-pipeline +identifier = +; emtpy string is considered default value [TopicExtractTask] - mesh_topic_db=luigi-pipeline/mesh_topic_db.json +mesh_topic_db = luigi-pipeline/mesh_topic_db.json [TopicFilterTask] - filter_config=luigi-pipeline/filter-config.jsonl +filter_config = luigi-pipeline/filter-config.jsonl [ConvertPDFTask] - grobid_host=0.0.0.0 - grobid_port=8070 +grobid_host = 0.0.0.0 +grobid_port = 8070 [AddTask] - db_url=luigi-pipeline/my-db.db - db_type=sqlite \ No newline at end of file +db_url = luigi-pipeline/my-db.db +db_type = sqlite From 0d343d3af274de31c082bb00259dd1f9a70864f1 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Fri, 18 Mar 2022 09:44:06 +0100 Subject: [PATCH 78/78] Add more info about run arguments --- src/bluesearch/entrypoint/database/run.py | 32 ++++++++++++++-------- tests/unit/entrypoint/database/test_run.py | 12 ++++---- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 5bfb3361a..d2414331c 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -70,14 +70,20 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: help="Final task of the luigi pipeline.", ) parser.add_argument( - "--config-path", + "--luigi-config-path", type=Path, - help="Configuration Path.", + help="Path to Luigi configuration file. By default, " + "luigi is looking into: /etc/luigi/luigi.cfg, luigi.cfg" + "and the environment variable LUIGI_CONFIG_PATH." 
+ "If a path is specified, it is the one used.", ) parser.add_argument( - "--luigi-config", + "--luigi-config-args", type=str, - help="Configuration parameters.", + help="Comma separated key-value arguments for Luigi configuration, " + "e.g. '--luigi-config GlobalParams.source:arxiv," + "DownloadTask.from-month:2021-04'. Overwrites the content of Luigi " + "configuration file (see --luigi-config-path).", ) parser.add_argument( "--dry-run", @@ -502,8 +508,8 @@ def run( *, dry_run: bool, final_task: str | None = None, - config_path: Path | None = None, - luigi_config: str | None = None, + luigi_config_path: Path | None = None, + luigi_config_args: str | None = None, ) -> int: """Run overall pipeline. @@ -521,17 +527,19 @@ def run( ParseTask.capture_output = CAPTURE_OUTPUT AddTask.capture_output = CAPTURE_OUTPUT - if config_path: - if not pathlib.Path(config_path).exists(): - raise ValueError(f"The configuration path {config_path} does not exist!") + if luigi_config_path: + if not pathlib.Path(luigi_config_path).exists(): + raise ValueError( + f"The configuration path {luigi_config_path} " f"does not exist!" 
+ ) config = luigi.configuration.get_config() - config.add_config_path(config_path) + config.add_config_path(luigi_config_path) config.reload() - if luigi_config: + if luigi_config_args: config = luigi.configuration.get_config() - for param in luigi_config.split(","): + for param in luigi_config_args.split(","): change = re.split(r"[.:]", param, maxsplit=3) config.set(*change) diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 0d31de22e..edfee0786 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -26,8 +26,8 @@ RUN_PARAMS = { "final_task", - "config_path", - "luigi_config", + "luigi_config_path", + "luigi_config_args", "dry_run", } @@ -40,9 +40,9 @@ def test_init_parser(): # # Test the values assert args.final_task is None - assert args.luigi_config is None + assert args.luigi_config_args is None assert args.dry_run is False - assert args.config_path is None + assert args.luigi_config_path is None def test_run_arguments(): @@ -113,7 +113,7 @@ def test_run_arguments(): ) def test_pipelines(source, tasks, tmp_path, capsys): run.run( - luigi_config=f"GlobalParams.source:{source}," + luigi_config_args=f"GlobalParams.source:{source}," f"DownloadTask.output_dir:{tmp_path}", dry_run=True, ) @@ -186,7 +186,7 @@ def create_output(args, **kwargs): monkeypatch.setattr(run.AddTask, "complete", lambda _: False) run.run( - luigi_config=f"GlobalParams.source:{source}," + luigi_config_args=f"GlobalParams.source:{source}," f"DownloadTask.output_dir:{tmp_path}," f"DownloadTask.identifier:{identifier}", dry_run=False,