From 2b4e454a112b9a86f69dcfe2232fe6e9682066d1 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Feb 2022 10:37:26 +0100 Subject: [PATCH 01/78] First draft of the entrypoint --- src/bluesearch/entrypoint/database/parent.py | 6 + src/bluesearch/entrypoint/database/run.py | 109 +++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 src/bluesearch/entrypoint/database/run.py diff --git a/src/bluesearch/entrypoint/database/parent.py b/src/bluesearch/entrypoint/database/parent.py index 8d392b134..fd2a01c62 100644 --- a/src/bluesearch/entrypoint/database/parent.py +++ b/src/bluesearch/entrypoint/database/parent.py @@ -13,6 +13,7 @@ download, init, parse, + run, topic_extract, topic_filter, ) @@ -71,6 +72,11 @@ def main(argv: Sequence[str] | None = None) -> int: init_parser=parse.init_parser, run=parse.run, ), + "run": Cmd( + help="Run the pipeline.", + init_parser=run.init_parser, + run=run.run, + ), "topic-extract": Cmd( help="Extract topic of article(s).", init_parser=topic_extract.init_parser, diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py new file mode 100644 index 000000000..0f01b3789 --- /dev/null +++ b/src/bluesearch/entrypoint/database/run.py @@ -0,0 +1,109 @@ +# Blue Brain Search is a text mining toolbox focused on scientific use cases. +# +# Copyright (C) 2020 Blue Brain Project, EPFL. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . +"""Run the overall pipeline.""" +from __future__ import annotations + +import argparse +import json +import logging +import warnings +from pathlib import Path +from typing import Iterator + +from defusedxml import ElementTree + +from bluesearch.database.article import ArticleSource + +logger = logging.getLogger(__name__) + +def convert_to_datetime(s: str) -> datetime: + """Try to convert a string to a datetime. + + Parameters + ---------- + s + String to be check as a valid date. + + Returns + ------- + datetime + The date specified in the input string. + + Raises + ------ + ArgumentTypeError + When the specified string has not a valid date format. + """ + try: + return datetime.strptime(s, "%Y-%m") + except ValueError: + msg = f"{s} is not a valid date" + raise argparse.ArgumentTypeError(msg) + + +def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + """Initialise the argument parser for the run subcommand. + + Parameters + ---------- + parser + The argument parser to initialise. + + Returns + ------- + argparse.ArgumentParser + The initialised argument parser. The same object as the `parser` + argument. + """ + parser.description = "Run the overall pipeline." + + parser.add_argument( + "source", + type=str, + choices=[member.value for member in ArticleSource], + help="Source of the articles.", + ) + parser.add_argument( + "from_month", + type=convert_to_datetime, + help="The starting month (included) for the download in format YYYY-MM. " + "All papers from the given month until today will be downloaded.", + ) + parser.add_argument( + "filter_config", + type=Path, + help=""" + Path to a .JSONL file that defines all the rules for filtering. + """, + ) + return parser + + +def run( + *, + source: str, + from_month: datetime, + filter_config: Path, +) -> int: + """Run overall pipeline. 
+ + Parameter description and potential defaults are documented inside of the + `get_parser` function. + """ + logger.info("Starting the overall pipeline") + + return 0 From 886c306a93f621d64470f218869c8e8be26fecec Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Feb 2022 10:44:26 +0100 Subject: [PATCH 02/78] Write initial test --- tests/unit/entrypoint/database/test_run.py | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/unit/entrypoint/database/test_run.py diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py new file mode 100644 index 000000000..06ba52a22 --- /dev/null +++ b/tests/unit/entrypoint/database/test_run.py @@ -0,0 +1,49 @@ +# Blue Brain Search is a text mining toolbox focused on scientific use cases. +# +# Copyright (C) 2020 Blue Brain Project, EPFL. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . 
+ +import argparse +import datetime +import inspect +import pathlib + +import numpy as np +import pandas as pd +import pytest + +from bluesearch.entrypoint.database import run +from bluesearch.utils import JSONL + +RUN_PARAMS = { + "source", + "from_month", + "filter_config", +} + +def test_init_parser(): + parser = run.init_parser(argparse.ArgumentParser()) + + args = parser.parse_args(["arxiv", "2021-12", "/path/to/config.jsonl"]) + assert vars(args).keys() == RUN_PARAMS + + # Test the values + assert args.source == "arxiv" + assert args.from_month == datetime.datetime(2021, 12, 1) + assert args.filter_config == pathlib.Path("/path/to/config.jsonl") + + +def test_run_arguments(): + assert inspect.signature(run.run).parameters.keys() == RUN_PARAMS From c9c2d4918c0c7a027a7b37cbdc3116f4c631bb94 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Feb 2022 14:24:56 +0100 Subject: [PATCH 03/78] First kind of working version/sketch --- src/bluesearch/entrypoint/database/run.py | 108 +++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 0f01b3789..4ff667bd5 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -21,10 +21,12 @@ import json import logging import warnings +from datetime import datetime from pathlib import Path from typing import Iterator -from defusedxml import ElementTree +import luigi +from luigi.util import inherits, requires from bluesearch.database.article import ArticleSource @@ -93,6 +95,102 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: return parser +FOLDER = Path.cwd() / "luigi" / "temp" +FOLDER.mkdir(exist_ok=True, parents=True) + +class DownloadTask(luigi.Task): + source = luigi.Parameter() + from_month = luigi.DateParameter() + def requires(self): + pass + + def run(self): + print(self.__class__.__name__) + output_file = 
Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "download_done.txt" + return luigi.LocalTarget(str(output_file)) + + + + +# @inherits(DownloadTask) +@requires(DownloadTask) +class TopicExtractTask(luigi.Task): + source = luigi.Parameter() + + def run(self): + print(self.__class__.__name__) + output_file = Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "extraction_done.txt" + + return luigi.LocalTarget(str(output_file)) + +# @inherits(TopicExtractTask) +@requires(TopicExtractTask) +class TopicFilterTask(luigi.Task): + filter_config = luigi.Parameter() + + def run(self): + print(self.__class__.__name__) + output_file = Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "filtering_done.txt" + + return luigi.LocalTarget(str(output_file)) + +@requires(TopicFilterTask) +class ConvertPDFTask(luigi.Task): + def run(self): + print(self.__class__.__name__) + output_file = Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "converting_pdf_done.txt" + + return luigi.LocalTarget(str(output_file)) + + +@inherits(ConvertPDFTask, TopicFilterTask) +# @requires(TopicFilterTask) +class ParseTask(luigi.Task): + def run(self): + print(self.__class__.__name__) + + output_file = Path(self.output().path) + output_file.touch() + + def requires(self): + if self.source == "arxiv": + return self.clone(ConvertPDFTask) + else: + return self.clone(TopicFilterTask) + + def output(self): + output_file = FOLDER / "parsing_done.txt" + + return luigi.LocalTarget(str(output_file)) + +@requires(ParseTask) +class AddTask(luigi.Task): + def run(self): + print(self.__class__.__name__) + output_file = Path(self.output().path) + output_file.touch() + + def output(self): + output_file = FOLDER / "adding_done.txt" + + return luigi.LocalTarget(str(output_file)) + def run( *, source: str, @@ -106,4 +204,12 @@ def run( """ 
logger.info("Starting the overall pipeline") + + luigi.build( + [ + AddTask(source=source, from_month=from_month, filter_config=filter_config) + ], + log_level="CRITICAL" + ) + return 0 From 4e561b7d14aa22839842cb31353b3495451410d4 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Feb 2022 15:22:11 +0100 Subject: [PATCH 04/78] Make download task work --- src/bluesearch/entrypoint/database/run.py | 95 +++++++++++++---------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 4ff667bd5..7eee4ebc3 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -27,35 +27,12 @@ import luigi from luigi.util import inherits, requires +from luigi.contrib.external_program import ExternalProgramTask from bluesearch.database.article import ArticleSource logger = logging.getLogger(__name__) -def convert_to_datetime(s: str) -> datetime: - """Try to convert a string to a datetime. - - Parameters - ---------- - s - String to be check as a valid date. - - Returns - ------- - datetime - The date specified in the input string. - - Raises - ------ - ArgumentTypeError - When the specified string has not a valid date format. - """ - try: - return datetime.strptime(s, "%Y-%m") - except ValueError: - msg = f"{s} is not a valid date" - raise argparse.ArgumentTypeError(msg) - def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """Initialise the argument parser for the run subcommand. @@ -81,7 +58,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: ) parser.add_argument( "from_month", - type=convert_to_datetime, + type=str, help="The starting month (included) for the download in format YYYY-MM. 
" "All papers from the given month until today will be downloaded.", ) @@ -92,28 +69,45 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: Path to a .JSONL file that defines all the rules for filtering. """, ) + parser.add_argument( + "output_dir", + type=Path, + help=""" + Path to the output folder. All the results stored under + `output_dir/source/date` where date is concatenation of the + `from_month` and the day of execution of this command. + """, + ) return parser FOLDER = Path.cwd() / "luigi" / "temp" FOLDER.mkdir(exist_ok=True, parents=True) -class DownloadTask(luigi.Task): +BBS_BINARY = "bbs_database" + +class DownloadTask(ExternalProgramTask): source = luigi.Parameter() - from_month = luigi.DateParameter() - def requires(self): - pass + from_month = luigi.Parameter() + output_dir = luigi.Parameter() - def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() + capture_output=False def output(self): - output_file = FOLDER / "download_done.txt" - return luigi.LocalTarget(str(output_file)) + today = datetime.today() + date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + + output_dir = Path(self.output_dir) / self.source / date / "raw" + + return luigi.LocalTarget(str(output_dir)) + def program_args(self): + output_dir = self.output().path + return [ + BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, + ] + # @inherits(DownloadTask) @@ -127,7 +121,7 @@ def run(self): output_file.touch() def output(self): - output_file = FOLDER / "extraction_done.txt" + output_file = Path(self.input().path).parent / "extraction_done.txt" return luigi.LocalTarget(str(output_file)) @@ -142,7 +136,7 @@ def run(self): output_file.touch() def output(self): - output_file = FOLDER / "filtering_done.txt" + output_file = Path(self.input().path).parent / "filtering_done.txt" return luigi.LocalTarget(str(output_file)) @@ -154,7 +148,7 @@ def run(self): output_file.touch() def 
output(self): - output_file = FOLDER / "converting_pdf_done.txt" + output_file = Path(self.input().path).parent / "converting_pdf_done.txt" return luigi.LocalTarget(str(output_file)) @@ -175,7 +169,7 @@ def requires(self): return self.clone(TopicFilterTask) def output(self): - output_file = FOLDER / "parsing_done.txt" + output_file = Path(self.input().path).parent / "parsing_done.txt" return luigi.LocalTarget(str(output_file)) @@ -187,15 +181,23 @@ def run(self): output_file.touch() def output(self): - output_file = FOLDER / "adding_done.txt" + output_file = Path(self.input().path).parent / "adding_done.txt" return luigi.LocalTarget(str(output_file)) +@requires(AddTask) +class ListTask(ExternalProgramTask): + capture_output = False + def program_args(self): + return ["ls", "-alh", "luigi/temp/"] + + def run( *, source: str, - from_month: datetime, + from_month: str, filter_config: Path, + output_dir: Path, ) -> int: """Run overall pipeline. @@ -207,9 +209,16 @@ def run( luigi.build( [ - AddTask(source=source, from_month=from_month, filter_config=filter_config) + AddTask( + source=source, + from_month=from_month, + filter_config=str(filter_config), + output_dir=str(output_dir), + ) + # ListTask(source=source, from_month=from_month, filter_config=filter_config) ], - log_level="CRITICAL" + log_level="INFO", + # log_level="INFO" ) return 0 From 4ae5fa9b111e32570f2a08691cb21a586d6e08e9 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 9 Feb 2022 21:04:12 +0100 Subject: [PATCH 05/78] Implement unzipping logic --- src/bluesearch/entrypoint/database/run.py | 109 ++++++++++++++++++++-- 1 file changed, 102 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 7eee4ebc3..972a01f6f 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -18,8 +18,11 @@ from __future__ import annotations import argparse +import gzip import json import logging 
+import shutil +import tarfile import warnings from datetime import datetime from pathlib import Path @@ -78,20 +81,50 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: `from_month` and the day of execution of this command. """, ) - return parser + parser.add_argument( + "db_url", + type=str, + help=""" + The location of the database depending on the database type. + For MySQL and MariaDB the server URL should be provided, for SQLite the + location of the database file. Generally, the scheme part of + the URL should be omitted, e.g. for MySQL the URL should be + of the form 'my_sql_server.ch:1234/my_database' and for SQLite + of the form '/path/to/the/local/database.db'. + """, + ) + parser.add_argument( + "--db-type", + default="sqlite", + type=str, + choices=("mariadb", "mysql", "postgres", "sqlite"), + help="Type of the database.", + ) + parser.add_argument( + "--mesh-topic-db", + type=Path, + help=""" + The JSON file with MeSH topic hierarchy information. Mandatory for + source types "pmc" and "pubmed". + + The JSON file should contain a flat dictionary with MeSH topic tree + numbers mapped to the corresponding topic labels. This file can be + produced using the `bbs_database parse-mesh-rdf` command. See that + command's description for more details. 
+ """, + ) + return parser -FOLDER = Path.cwd() / "luigi" / "temp" -FOLDER.mkdir(exist_ok=True, parents=True) BBS_BINARY = "bbs_database" +CAPTURE_OUTPUT = False class DownloadTask(ExternalProgramTask): source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() - capture_output=False def output(self): today = datetime.today() @@ -108,10 +141,58 @@ def program_args(self): BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, ] +@requires(DownloadTask) +class UnzipTask(ExternalProgramTask): + """Needs to support unziping of both pubmed and pmc.""" + source = luigi.Parameter() -# @inherits(DownloadTask) -@requires(DownloadTask) + def output(self): + input_path = Path(self.input().path) + output_dir = input_path.parent / "raw_unzipped" + + return luigi.LocalTarget(str(output_dir)) + + def run(self): + input_dir = Path(self.input().path) # raw + output_dir = Path(self.output().path) # raw_unzipped + + + output_dir.mkdir(exist_ok=True, parents=True) + if self.source == "pmc": + # .tar.gz + # We want collapse the folder hierarchy + all_tar_files = input_dir.rglob("*.tar.gz") + for archive in all_tar_files: + output_path = output_dir / archive.stem + my_tar = tarfile.open(archive) + all_articles = [x for x in my_tar.getmembers() if x.isfile()] + for article in all_articles: + output_path = output_dir / article.path.rpartition("/")[2] + f_in = my_tar.extractfile(article) + with open(output_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + my_tar.close() + + elif self.source == "pubmed": + # .xml.gz + all_zip_files = [p for p in input_dir.iterdir() if p.suffix == ".gz"] + if not all_zip_files: + raise ValueError("No zip files were found") + + for archive in all_zip_files: + output_path = output_dir / archive.stem + with gzip.open(archive, "rb") as f_in: + with open(output_path,"wb") as f_out: + shutil.copyfileobj(f_in, f_out) + + else: + raise ValueError(f"Unsupported source {self.source}") + + + + 
+@requires(DownloadTask, UnzipTask) class TopicExtractTask(luigi.Task): source = luigi.Parameter() @@ -120,8 +201,15 @@ def run(self): output_file = Path(self.output().path) output_file.touch() + def requires(self): + if self.source in {"pmc", "pubmed"}: + return self.clone(UnzipTask) + else: + return self.clone(DownloadTask) + def output(self): - output_file = Path(self.input().path).parent / "extraction_done.txt" + input_dir = self.input()[0] + output_file = Path(input_dir.path).parent / "extraction_done.txt" return luigi.LocalTarget(str(output_file)) @@ -198,6 +286,9 @@ def run( from_month: str, filter_config: Path, output_dir: Path, + db_url: str, + db_type: str, + mesh_topic_db: Path ) -> int: """Run overall pipeline. @@ -206,6 +297,8 @@ def run( """ logger.info("Starting the overall pipeline") + DownloadTask.capture_output = CAPTURE_OUTPUT + TopicExtractTask.capture_output = CAPTURE_OUTPUT luigi.build( [ @@ -218,6 +311,8 @@ def run( # ListTask(source=source, from_month=from_month, filter_config=filter_config) ], log_level="INFO", + # workers=0, + local_scheduler=True, # prevents the task already in progress errors # log_level="INFO" ) From daeb4d748746b61fadd086d6d9c0b9a0cd04aae2 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 12:38:29 +0100 Subject: [PATCH 06/78] Implement dry run --- src/bluesearch/entrypoint/database/run.py | 47 ++++++++++++++--------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 972a01f6f..7b82821b0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -31,6 +31,7 @@ import luigi from luigi.util import inherits, requires from luigi.contrib.external_program import ExternalProgramTask +from luigi.tools.deps_tree import print_tree from bluesearch.database.article import ArticleSource @@ -114,6 +115,13 @@ def init_parser(parser: argparse.ArgumentParser) -> 
argparse.ArgumentParser: command's description for more details. """, ) + parser.add_argument( + "--dry-run", + "-n", + action="store_true", + help="Prints out a diagram of the pipeline without running it.", + ) + return parser @@ -192,7 +200,7 @@ def run(self): -@requires(DownloadTask, UnzipTask) +@inherits(DownloadTask, UnzipTask) class TopicExtractTask(luigi.Task): source = luigi.Parameter() @@ -208,8 +216,8 @@ def requires(self): return self.clone(DownloadTask) def output(self): - input_dir = self.input()[0] - output_file = Path(input_dir.path).parent / "extraction_done.txt" + input_dir = self.input() + output_file = Path(input_dir.path).parent / "topic_infos.jsonl" return luigi.LocalTarget(str(output_file)) @@ -288,7 +296,8 @@ def run( output_dir: Path, db_url: str, db_type: str, - mesh_topic_db: Path + mesh_topic_db: Path, + dry_run: bool ) -> int: """Run overall pipeline. @@ -300,20 +309,22 @@ def run( DownloadTask.capture_output = CAPTURE_OUTPUT TopicExtractTask.capture_output = CAPTURE_OUTPUT - luigi.build( - [ - AddTask( - source=source, - from_month=from_month, - filter_config=str(filter_config), - output_dir=str(output_dir), - ) - # ListTask(source=source, from_month=from_month, filter_config=filter_config) - ], - log_level="INFO", - # workers=0, - local_scheduler=True, # prevents the task already in progress errors - # log_level="INFO" + final_task = AddTask( + source=source, + from_month=from_month, + filter_config=str(filter_config), + output_dir=str(output_dir), ) + luigi_kwargs = { + "tasks": [final_task], + "log_level": "DEBUG", + "local_scheduler": True, + } + if dry_run: + print(print_tree(final_task, last=False)) + else: + + luigi.build(**luigi_kwargs) + return 0 From 621c6bb6a06f3494fe13f9bbbc0f224639f6c9b6 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 12:48:39 +0100 Subject: [PATCH 07/78] Turn positionals into required options Should improve readability --- src/bluesearch/entrypoint/database/run.py | 15 ++++++++++----- 1 
file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 7b82821b0..cfd7f66f4 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -55,26 +55,30 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.description = "Run the overall pipeline." parser.add_argument( - "source", + "--source", + required=True, type=str, choices=[member.value for member in ArticleSource], help="Source of the articles.", ) parser.add_argument( - "from_month", + "--from-month", + required=True, type=str, help="The starting month (included) for the download in format YYYY-MM. " "All papers from the given month until today will be downloaded.", ) parser.add_argument( - "filter_config", + "--filter-config", + required=True, type=Path, help=""" Path to a .JSONL file that defines all the rules for filtering. """, ) parser.add_argument( - "output_dir", + "--output-dir", + required=True, type=Path, help=""" Path to the output folder. All the results stored under @@ -83,7 +87,8 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """, ) parser.add_argument( - "db_url", + "--db-url", + required=True, type=str, help=""" The location of the database depending on the database type. 
From d3f97ac05a5fd1c244a0e3396a435393ada73861 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 13:46:26 +0100 Subject: [PATCH 08/78] Implement TopicExtractTask --- src/bluesearch/entrypoint/database/run.py | 26 +++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index cfd7f66f4..2107419c2 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -154,6 +154,8 @@ def program_args(self): BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, ] + + @requires(DownloadTask) class UnzipTask(ExternalProgramTask): """Needs to support unziping of both pubmed and pmc.""" @@ -206,13 +208,9 @@ def run(self): @inherits(DownloadTask, UnzipTask) -class TopicExtractTask(luigi.Task): +class TopicExtractTask(ExternalProgramTask): source = luigi.Parameter() - - def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() + mesh_topic_db = luigi.Parameter() def requires(self): if self.source in {"pmc", "pubmed"}: @@ -226,6 +224,21 @@ def output(self): return luigi.LocalTarget(str(output_file)) + + def program_args(self): + input_dir = self.input().path + output_dir = self.output().path + + command = [ + BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, + ] + + if self.source in {"pmc", "pubmed"}: + command.append(f"--mesh-topic-db={self.mesh_topic_db}") + + return command + + # @inherits(TopicExtractTask) @requires(TopicExtractTask) class TopicFilterTask(luigi.Task): @@ -319,6 +332,7 @@ def run( from_month=from_month, filter_config=str(filter_config), output_dir=str(output_dir), + mesh_topic_db=str(mesh_topic_db), ) luigi_kwargs = { From a4710eb7155d6224595dcc1b78df8885a7f61990 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 14:02:46 +0100 Subject: [PATCH 09/78] Implement topicfiltertask --- 
src/bluesearch/entrypoint/database/run.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 2107419c2..6dda94e61 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -239,21 +239,25 @@ def program_args(self): return command -# @inherits(TopicExtractTask) @requires(TopicExtractTask) -class TopicFilterTask(luigi.Task): +class TopicFilterTask(ExternalProgramTask): filter_config = luigi.Parameter() - def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() - def output(self): - output_file = Path(self.input().path).parent / "filtering_done.txt" + output_file = Path(self.input().path).parent / "filtering.csv" return luigi.LocalTarget(str(output_file)) + def program_args(self): + extracted_topics = self.input().path + output_file = self.output().path + + command = [ + BBS_BINARY, "topic-filter", "-v", extracted_topics, self.filter_config, output_file, + ] + + return command + @requires(TopicFilterTask) class ConvertPDFTask(luigi.Task): def run(self): From a1b0e86bbe2c164fc29c3289ef18ec69f6e76174 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 14:58:33 +0100 Subject: [PATCH 10/78] Add create symlinks task --- src/bluesearch/entrypoint/database/run.py | 80 ++++++++++++++++++++--- 1 file changed, 71 insertions(+), 9 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 6dda94e61..c3217efc0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -29,6 +29,7 @@ from typing import Iterator import luigi +import pandas as pd from luigi.util import inherits, requires from luigi.contrib.external_program import ExternalProgramTask from luigi.tools.deps_tree import print_tree @@ -126,6 +127,16 @@ def init_parser(parser: 
argparse.ArgumentParser) -> argparse.ArgumentParser: action="store_true", help="Prints out a diagram of the pipeline without running it.", ) + parser.add_argument( + "--grobid-host", + type=str, + help="The host of the GROBID server.", + ) + parser.add_argument( + "--grobid-port", + type=int, + help="The port of the GROBID server.", + ) return parser @@ -258,21 +269,68 @@ def program_args(self): return command + @requires(TopicFilterTask) -class ConvertPDFTask(luigi.Task): +class CreateSymlinksTask(luigi.Task): + def output(self): + output_dir = Path(self.input().path).parent / "filtered" + + return luigi.LocalTarget(str(output_dir)) + def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() + output_dir = Path(self.output().path) + filtering_path = Path(self.input().path) + input_dir = output_dir.parent / "raw_unzipped" + + if (output_dir.parent / "raw_unzipped").exists(): + input_dir = output_dir.parent / "raw_unzipped" + else: + input_dir = output_dir.parent / "raw" + + filtering = pd.read_csv(filtering_path) + accepted = filtering[filtering.accept].path + + def create_symlink(path): + input_path = Path(path) + output_path = output_dir / input_path.name + output_path.symlink_to(input_path) + + output_dir.mkdir(exist_ok=True) + + accepted.apply(create_symlink) + + + + +@requires(CreateSymlinksTask) +class ConvertPDFTask(ExternalProgramTask): + grobid_host = luigi.Parameter() + grobid_port = luigi.Parameter() + + + def program_args(self): + input_dir = Path(self.input().path).parent / "raw" + output_dir = self.output().path + + command = [ + BBS_BINARY, + "convert-pdf", + "-v", + self.grobid_host, + self.grobid_port, + input_dir, + f"--output_dir={output_dir}", + ] + + return command def output(self): - output_file = Path(self.input().path).parent / "converting_pdf_done.txt" + output_file = Path(self.input().path).parent / "converted_pdfs" return luigi.LocalTarget(str(output_file)) -@inherits(ConvertPDFTask, 
TopicFilterTask) -# @requires(TopicFilterTask) +@inherits(ConvertPDFTask, CreateSymlinksTask) class ParseTask(luigi.Task): def run(self): print(self.__class__.__name__) @@ -318,8 +376,10 @@ def run( output_dir: Path, db_url: str, db_type: str, - mesh_topic_db: Path, - dry_run: bool + mesh_topic_db: Path | None, + dry_run: bool, + grobid_host: str | None, + grobid_port: int | None, ) -> int: """Run overall pipeline. @@ -337,6 +397,8 @@ def run( filter_config=str(filter_config), output_dir=str(output_dir), mesh_topic_db=str(mesh_topic_db), + grobid_host=grobid_host, + grobid_port=grobid_port, ) luigi_kwargs = { From 2d7c068ed6b844da105082479dbb75e2e3279586 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 15:25:39 +0100 Subject: [PATCH 11/78] Implement convertpdf task --- src/bluesearch/entrypoint/database/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index c3217efc0..a4dfc5c6e 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -309,7 +309,7 @@ class ConvertPDFTask(ExternalProgramTask): def program_args(self): - input_dir = Path(self.input().path).parent / "raw" + input_dir = Path(self.input().path).parent / "filtered" output_dir = self.output().path command = [ @@ -319,7 +319,7 @@ def program_args(self): self.grobid_host, self.grobid_port, input_dir, - f"--output_dir={output_dir}", + f"--output-dir={output_dir}", ] return command From 2a37dea7cc494fa8ee23ef442db3fffe8a01ba5b Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 15:39:24 +0100 Subject: [PATCH 12/78] Implement parse task --- src/bluesearch/entrypoint/database/run.py | 41 ++++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index a4dfc5c6e..1b0161722 100644 --- 
a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -331,13 +331,7 @@ def output(self): @inherits(ConvertPDFTask, CreateSymlinksTask) -class ParseTask(luigi.Task): - def run(self): - print(self.__class__.__name__) - - output_file = Path(self.output().path) - output_file.touch() - +class ParseTask(ExternalProgramTask): def requires(self): if self.source == "arxiv": return self.clone(ConvertPDFTask) @@ -345,10 +339,41 @@ def requires(self): return self.clone(TopicFilterTask) def output(self): - output_file = Path(self.input().path).parent / "parsing_done.txt" + output_file = Path(self.input().path).parent / "parsed" return luigi.LocalTarget(str(output_file)) + def program_args(self): + output_dir = Path(self.output().path) + output_dir.mkdir(exist_ok=True) + + + if (output_dir.parent / "converted_pdfs").exists(): + input_dir = output_dir.parent / "converted_pdfs" + else: + input_dir = output_dir.parent / "filtered" + + # Determine parser + source2parser = { + "arxiv": "tei-xml-arxiv", + "biorxiv": "jatx-xml", + "medrxiv": "jatx-xml", + "pmc": "jatx-xml", + "pubmed": "pubmed-xml", + } + parser = source2parser[self.source] + + command = [ + BBS_BINARY, + "parse", + "-v", + parser, + input_dir, + output_dir, + ] + + return command + @requires(ParseTask) class AddTask(luigi.Task): def run(self): From e34439b1c0944f0bda6311878a482956fad4fa0d Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 10 Feb 2022 16:38:45 +0100 Subject: [PATCH 13/78] Implement AddTask --- src/bluesearch/entrypoint/database/run.py | 55 +++++++++++++++++++---- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 1b0161722..451e6dc73 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -30,6 +30,7 @@ import luigi import pandas as pd +import sqlalchemy from luigi.util import inherits, requires from 
luigi.contrib.external_program import ExternalProgramTask from luigi.tools.deps_tree import print_tree @@ -374,17 +375,53 @@ def program_args(self): return command + @requires(ParseTask) -class AddTask(luigi.Task): - def run(self): - print(self.__class__.__name__) - output_file = Path(self.output().path) - output_file.touch() +class AddTask(ExternalProgramTask): + db_url = luigi.Parameter() + db_type = luigi.Parameter() + + def complete(self): + # If all the articles are inside + if self.db_type == "sqlite": + prefix = "sqlite:///" + elif self.db_type == "postgres": + prefix = "postgresql+pg8000://" + else: + raise ValueError + + engine = sqlalchemy.create_engine(f"{prefix}{self.db_url}") + + input_dir = Path(self.input().path) + all_uids = [article.stem for article in input_dir.iterdir() if article.suffix == ".json"] + + new_uids = [] + for uid in all_uids: + query = "SELECT article_id from articles WHERE article_id = ?" + res = engine.execute(query, (uid,)).fetchall() + + if not res: + new_uids.append(uid) + + return not new_uids + + + def program_args(self): + input_dir = Path(self.input().path) + + + command = [ + BBS_BINARY, + "add", + self.db_url, + input_dir, + "-v", + f"--db-type={self.db_type}", + ] + + return command - def output(self): - output_file = Path(self.input().path).parent / "adding_done.txt" - return luigi.LocalTarget(str(output_file)) @requires(AddTask) class ListTask(ExternalProgramTask): @@ -424,6 +461,8 @@ def run( mesh_topic_db=str(mesh_topic_db), grobid_host=grobid_host, grobid_port=grobid_port, + db_url=db_url, + db_type=db_type, ) luigi_kwargs = { From 06df235631a5be8833dbb79fa9f13473f1a442f9 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 10:18:54 +0100 Subject: [PATCH 14/78] Improve logic in custom compleete --- src/bluesearch/entrypoint/database/run.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 451e6dc73..6e47df0a4 
100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -393,6 +393,9 @@ def complete(self): engine = sqlalchemy.create_engine(f"{prefix}{self.db_url}") input_dir = Path(self.input().path) + if not input_dir.exists(): + return False + all_uids = [article.stem for article in input_dir.iterdir() if article.suffix == ".json"] new_uids = [] From 973b284b246a32af99165f110fcf6b28153bbe4a Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 10:36:31 +0100 Subject: [PATCH 15/78] Handle keyboardinterrupt in topic-extract --- .../entrypoint/database/topic_extract.py | 105 +++++++++--------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 5e5ce9025..4ef6d26e3 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -164,62 +164,65 @@ def run( article_source = ArticleSource(source) all_results: list[dict[str, Any]] = [] - if article_source is ArticleSource.PMC: - if mesh_topic_db is None: - logger.error("The option --mesh-topics-db is mandatory for source type pmc") - return 1 - mesh_tree = mesh.MeSHTree.load(mesh_topic_db) - for path in inputs: - logger.info(f"Processing {path}") - topic_info = TopicInfo(source=article_source, path=path.resolve()) - journal_topics = get_topics_for_pmc_article(path) - if journal_topics: - topic_info.add_journal_topics( - "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) - ) - all_results.append(topic_info.json()) - elif article_source is ArticleSource.PUBMED: - if mesh_topic_db is None: - logger.error( - "The option --mesh-topics-db is mandatory for source type pubmed" - ) - return 1 - mesh_tree = mesh.MeSHTree.load(mesh_topic_db) - for path in inputs: - logger.info(f"Processing {path}") - articles = ElementTree.parse(input_path) - for i, article in 
enumerate(articles.iter("PubmedArticle")): - topic_info = TopicInfo( - source=article_source, - path=path.resolve(), - element_in_file=i, - ) - article_topics = extract_article_topics_for_pubmed_article(article) - journal_topics = extract_journal_topics_for_pubmed_article(article) - if article_topics: - topic_info.add_article_topics( - "MeSH", mesh.resolve_parents(article_topics, mesh_tree) - ) + try: + if article_source is ArticleSource.PMC: + if mesh_topic_db is None: + logger.error("The option --mesh-topics-db is mandatory for source type pmc") + return 1 + mesh_tree = mesh.MeSHTree.load(mesh_topic_db) + for path in inputs: + logger.info(f"Processing {path}") + topic_info = TopicInfo(source=article_source, path=path.resolve()) + journal_topics = get_topics_for_pmc_article(path) if journal_topics: topic_info.add_journal_topics( "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) ) all_results.append(topic_info.json()) - elif article_source is ArticleSource.ARXIV: - for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): - topic_info = TopicInfo(source=article_source, path=path) - topic_info.add_article_topics("arXiv", article_topics) - all_results.append(topic_info.json()) - elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: - for path in inputs: - logger.info(f"Processing {path}") - topic, journal = extract_article_topics_from_medrxiv_article(path) - topic_info = TopicInfo(source=ArticleSource(journal), path=path) - topic_info.add_article_topics("Subject Area", [topic]) - all_results.append(topic_info.json()) - else: - logger.error(f"The source type {source!r} is not implemented yet") - return 1 + elif article_source is ArticleSource.PUBMED: + if mesh_topic_db is None: + logger.error( + "The option --mesh-topics-db is mandatory for source type pubmed" + ) + return 1 + mesh_tree = mesh.MeSHTree.load(mesh_topic_db) + for path in inputs: + logger.info(f"Processing {path}") + articles = ElementTree.parse(input_path) + for 
i, article in enumerate(articles.iter("PubmedArticle")): + topic_info = TopicInfo( + source=article_source, + path=path.resolve(), + element_in_file=i, + ) + article_topics = extract_article_topics_for_pubmed_article(article) + journal_topics = extract_journal_topics_for_pubmed_article(article) + if article_topics: + topic_info.add_article_topics( + "MeSH", mesh.resolve_parents(article_topics, mesh_tree) + ) + if journal_topics: + topic_info.add_journal_topics( + "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) + ) + all_results.append(topic_info.json()) + elif article_source is ArticleSource.ARXIV: + for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): + topic_info = TopicInfo(source=article_source, path=path) + topic_info.add_article_topics("arXiv", article_topics) + all_results.append(topic_info.json()) + elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: + for path in inputs: + logger.info(f"Processing {path}") + topic, journal = extract_article_topics_from_medrxiv_article(path) + topic_info = TopicInfo(source=ArticleSource(journal), path=path) + topic_info.add_article_topics("Subject Area", [topic]) + all_results.append(topic_info.json()) + else: + logger.error(f"The source type {source!r} is not implemented yet") + return 1 + except KeyboardInterrupt: + pass JSONL.dump_jsonl(all_results, output_file, overwrite) From 4da3450e61422d09ed304c0bec0f931bcf45e793 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 11:05:57 +0100 Subject: [PATCH 16/78] Timeout experiments Very unsuccessful --- src/bluesearch/entrypoint/database/run.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 6e47df0a4..24299eef0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -142,7 +142,8 @@ def init_parser(parser: argparse.ArgumentParser) -> 
argparse.ArgumentParser: return parser -BBS_BINARY = "bbs_database" +BBS_BINARY = ["gtimeout", "--preserve-status", "5" , "bbs_database"] +BBS_BINARY = ["bbs_database"] CAPTURE_OUTPUT = False class DownloadTask(ExternalProgramTask): @@ -163,7 +164,7 @@ def output(self): def program_args(self): output_dir = self.output().path return [ - BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, + *BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, ] @@ -242,7 +243,7 @@ def program_args(self): output_dir = self.output().path command = [ - BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, + *BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, ] if self.source in {"pmc", "pubmed"}: @@ -265,7 +266,7 @@ def program_args(self): output_file = self.output().path command = [ - BBS_BINARY, "topic-filter", "-v", extracted_topics, self.filter_config, output_file, + *BBS_BINARY, "topic-filter", "-v", extracted_topics, self.filter_config, output_file, ] return command @@ -314,7 +315,7 @@ def program_args(self): output_dir = self.output().path command = [ - BBS_BINARY, + *BBS_BINARY, "convert-pdf", "-v", self.grobid_host, @@ -365,7 +366,7 @@ def program_args(self): parser = source2parser[self.source] command = [ - BBS_BINARY, + *BBS_BINARY, "parse", "-v", parser, @@ -425,13 +426,8 @@ def program_args(self): return command - -@requires(AddTask) -class ListTask(ExternalProgramTask): - capture_output = False - def program_args(self): - return ["ls", "-alh", "luigi/temp/"] - +class worker(luigi.Config): + timeout = luigi.IntParameter(5) def run( *, @@ -468,6 +464,7 @@ def run( db_type=db_type, ) + luigi_kwargs = { "tasks": [final_task], "log_level": "DEBUG", From 8ffbb61773e1a6e2e7452847abd15d144f50086b Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 11:51:37 +0100 Subject: [PATCH 17/78] Remove keyboardinterrupt catching --- src/bluesearch/entrypoint/database/run.py | 2 +- 
.../entrypoint/database/topic_extract.py | 105 +++++++++--------- 2 files changed, 52 insertions(+), 55 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 24299eef0..59669e67c 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -415,7 +415,7 @@ def program_args(self): command = [ - BBS_BINARY, + *BBS_BINARY, "add", self.db_url, input_dir, diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 4ef6d26e3..5e5ce9025 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -164,65 +164,62 @@ def run( article_source = ArticleSource(source) all_results: list[dict[str, Any]] = [] - try: - if article_source is ArticleSource.PMC: - if mesh_topic_db is None: - logger.error("The option --mesh-topics-db is mandatory for source type pmc") - return 1 - mesh_tree = mesh.MeSHTree.load(mesh_topic_db) - for path in inputs: - logger.info(f"Processing {path}") - topic_info = TopicInfo(source=article_source, path=path.resolve()) - journal_topics = get_topics_for_pmc_article(path) + if article_source is ArticleSource.PMC: + if mesh_topic_db is None: + logger.error("The option --mesh-topics-db is mandatory for source type pmc") + return 1 + mesh_tree = mesh.MeSHTree.load(mesh_topic_db) + for path in inputs: + logger.info(f"Processing {path}") + topic_info = TopicInfo(source=article_source, path=path.resolve()) + journal_topics = get_topics_for_pmc_article(path) + if journal_topics: + topic_info.add_journal_topics( + "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) + ) + all_results.append(topic_info.json()) + elif article_source is ArticleSource.PUBMED: + if mesh_topic_db is None: + logger.error( + "The option --mesh-topics-db is mandatory for source type pubmed" + ) + return 1 + mesh_tree = mesh.MeSHTree.load(mesh_topic_db) 
+ for path in inputs: + logger.info(f"Processing {path}") + articles = ElementTree.parse(input_path) + for i, article in enumerate(articles.iter("PubmedArticle")): + topic_info = TopicInfo( + source=article_source, + path=path.resolve(), + element_in_file=i, + ) + article_topics = extract_article_topics_for_pubmed_article(article) + journal_topics = extract_journal_topics_for_pubmed_article(article) + if article_topics: + topic_info.add_article_topics( + "MeSH", mesh.resolve_parents(article_topics, mesh_tree) + ) if journal_topics: topic_info.add_journal_topics( "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) ) all_results.append(topic_info.json()) - elif article_source is ArticleSource.PUBMED: - if mesh_topic_db is None: - logger.error( - "The option --mesh-topics-db is mandatory for source type pubmed" - ) - return 1 - mesh_tree = mesh.MeSHTree.load(mesh_topic_db) - for path in inputs: - logger.info(f"Processing {path}") - articles = ElementTree.parse(input_path) - for i, article in enumerate(articles.iter("PubmedArticle")): - topic_info = TopicInfo( - source=article_source, - path=path.resolve(), - element_in_file=i, - ) - article_topics = extract_article_topics_for_pubmed_article(article) - journal_topics = extract_journal_topics_for_pubmed_article(article) - if article_topics: - topic_info.add_article_topics( - "MeSH", mesh.resolve_parents(article_topics, mesh_tree) - ) - if journal_topics: - topic_info.add_journal_topics( - "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) - ) - all_results.append(topic_info.json()) - elif article_source is ArticleSource.ARXIV: - for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): - topic_info = TopicInfo(source=article_source, path=path) - topic_info.add_article_topics("arXiv", article_topics) - all_results.append(topic_info.json()) - elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: - for path in inputs: - logger.info(f"Processing {path}") - topic, journal = 
extract_article_topics_from_medrxiv_article(path) - topic_info = TopicInfo(source=ArticleSource(journal), path=path) - topic_info.add_article_topics("Subject Area", [topic]) - all_results.append(topic_info.json()) - else: - logger.error(f"The source type {source!r} is not implemented yet") - return 1 - except KeyboardInterrupt: - pass + elif article_source is ArticleSource.ARXIV: + for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): + topic_info = TopicInfo(source=article_source, path=path) + topic_info.add_article_topics("arXiv", article_topics) + all_results.append(topic_info.json()) + elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: + for path in inputs: + logger.info(f"Processing {path}") + topic, journal = extract_article_topics_from_medrxiv_article(path) + topic_info = TopicInfo(source=ArticleSource(journal), path=path) + topic_info.add_article_topics("Subject Area", [topic]) + all_results.append(topic_info.json()) + else: + logger.error(f"The source type {source!r} is not implemented yet") + return 1 JSONL.dump_jsonl(all_results, output_file, overwrite) From 5b647678e347752b0ff25671a83fd15f9d363e7b Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 12:02:50 +0100 Subject: [PATCH 18/78] Fix typo and wrong task dependency --- src/bluesearch/entrypoint/database/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 59669e67c..fbbac07d0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -338,7 +338,7 @@ def requires(self): if self.source == "arxiv": return self.clone(ConvertPDFTask) else: - return self.clone(TopicFilterTask) + return self.clone(CreateSymlinksTask) def output(self): output_file = Path(self.input().path).parent / "parsed" @@ -358,9 +358,9 @@ def program_args(self): # Determine parser source2parser = { "arxiv": 
"tei-xml-arxiv", - "biorxiv": "jatx-xml", - "medrxiv": "jatx-xml", - "pmc": "jatx-xml", + "biorxiv": "jats-xml", + "medrxiv": "jats-xml", + "pmc": "jats-xml", "pubmed": "pubmed-xml", } parser = source2parser[self.source] From b7ef2caac6fb91fb980f4f201dd0e0bee6c41e03 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 15:37:59 +0100 Subject: [PATCH 19/78] Add small changes --- src/bluesearch/entrypoint/database/run.py | 9 +++++++-- src/bluesearch/entrypoint/database/topic_extract.py | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index fbbac07d0..43b25963b 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -245,6 +245,11 @@ def program_args(self): command = [ *BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, ] + + if self.source in {"medrxiv", "biorxiv"}: + command.extend( + ["-R", "-m", r".*\.meca$"], + ) if self.source in {"pmc", "pubmed"}: command.append(f"--mesh-topic-db={self.mesh_topic_db}") @@ -290,7 +295,7 @@ def run(self): input_dir = output_dir.parent / "raw" filtering = pd.read_csv(filtering_path) - accepted = filtering[filtering.accept].path + accepted = pd.Series(filtering[filtering.accept].path.unique()) def create_symlink(path): input_path = Path(path) @@ -361,7 +366,7 @@ def program_args(self): "biorxiv": "jats-xml", "medrxiv": "jats-xml", "pmc": "jats-xml", - "pubmed": "pubmed-xml", + "pubmed": "pubmed-xml-set", } parser = source2parser[self.source] diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 5e5ce9025..4a0590b68 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -187,7 +187,7 @@ def run( mesh_tree = mesh.MeSHTree.load(mesh_topic_db) for path in inputs: logger.info(f"Processing {path}") - articles = 
ElementTree.parse(input_path) + articles = ElementTree.parse(path) for i, article in enumerate(articles.iter("PubmedArticle")): topic_info = TopicInfo( source=article_source, @@ -214,6 +214,7 @@ def run( for path in inputs: logger.info(f"Processing {path}") topic, journal = extract_article_topics_from_medrxiv_article(path) + journal = journal.lower() topic_info = TopicInfo(source=ArticleSource(journal), path=path) topic_info.add_article_topics("Subject Area", [topic]) all_results.append(topic_info.json()) From ccd88e02bc039ffdef3e3d297739136a6a867fda Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 16:36:11 +0100 Subject: [PATCH 20/78] Add some docstrings and annotations --- src/bluesearch/entrypoint/database/run.py | 39 ++++++++++++++++------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 43b25963b..99681cd52 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -147,12 +147,17 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: CAPTURE_OUTPUT = False class DownloadTask(ExternalProgramTask): + """Download raw files. + + They will be stored in the `raw/` folder. 
+ """ source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() - def output(self): + def output(self) -> luigi.LocalTarget: + """Define download folder.""" today = datetime.today() date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" @@ -161,7 +166,8 @@ def output(self): return luigi.LocalTarget(str(output_dir)) - def program_args(self): + def program_args(self) -> list[str]: + """Define subprocess arguments.""" output_dir = self.output().path return [ *BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, @@ -171,17 +177,23 @@ def program_args(self): @requires(DownloadTask) class UnzipTask(ExternalProgramTask): - """Needs to support unziping of both pubmed and pmc.""" + """Unzip raw files (if necessary). + + Only applicable in case of `pubmed` and `pmc`. The unzipped files + are stored inside of `raw_unzipped`. + """ source = luigi.Parameter() - def output(self): + def output(self) -> luigi.LocalTarget: + """Define unzipping folder.""" input_path = Path(self.input().path) output_dir = input_path.parent / "raw_unzipped" return luigi.LocalTarget(str(output_dir)) - def run(self): + def run(self) -> None: + """Unzip.""" input_dir = Path(self.input().path) # raw output_dir = Path(self.output().path) # raw_unzipped @@ -222,16 +234,24 @@ def run(self): @inherits(DownloadTask, UnzipTask) class TopicExtractTask(ExternalProgramTask): + """Topic extraction. + + The input of this dask is either `raw/` or `raw_unzipped/` depending + on the source. The output is going to be a single file + `topic_infos.jsonl`. 
+ """ source = luigi.Parameter() mesh_topic_db = luigi.Parameter() - def requires(self): + def requires(self) -> luigi.Task: + """Define conditional dependencies.""" if self.source in {"pmc", "pubmed"}: return self.clone(UnzipTask) else: return self.clone(DownloadTask) - def output(self): + def output(self) -> luigi.LocalTarget: + """Define output file path.""" input_dir = self.input() output_file = Path(input_dir.path).parent / "topic_infos.jsonl" @@ -239,6 +259,7 @@ def output(self): def program_args(self): + """Define subprocess arguments.""" input_dir = self.input().path output_dir = self.output().path @@ -430,10 +451,6 @@ def program_args(self): return command - -class worker(luigi.Config): - timeout = luigi.IntParameter(5) - def run( *, source: str, From 5ad1a75add482c7e19293212d7fa6f94da0b4077 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 16:41:22 +0100 Subject: [PATCH 21/78] Fix the unit test --- tests/unit/entrypoint/database/test_run.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 06ba52a22..1c6229603 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -31,17 +31,32 @@ "source", "from_month", "filter_config", + "output_dir", + "db_url", + "db_type", + "mesh_topic_db", + "dry_run", + "grobid_host", + "grobid_port", } def test_init_parser(): parser = run.init_parser(argparse.ArgumentParser()) - args = parser.parse_args(["arxiv", "2021-12", "/path/to/config.jsonl"]) + args = parser.parse_args( + [ + "--source=arxiv", + "--from-month=2021-12", + "--filter-config=/path/to/config.jsonl", + "--output-dir=some/output/dir", + "--db-url=some.url" + ] + ) assert vars(args).keys() == RUN_PARAMS # Test the values assert args.source == "arxiv" - assert args.from_month == datetime.datetime(2021, 12, 1) + assert args.from_month == "2021-12" assert 
args.filter_config == pathlib.Path("/path/to/config.jsonl") From d186281159136790de3c9ca2576aa87d0deff7be Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 16:56:29 +0100 Subject: [PATCH 22/78] Write additional unit test --- src/bluesearch/entrypoint/database/run.py | 2 +- tests/unit/entrypoint/database/test_run.py | 92 ++++++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 99681cd52..ea162a941 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -333,7 +333,7 @@ def create_symlink(path): @requires(CreateSymlinksTask) class ConvertPDFTask(ExternalProgramTask): grobid_host = luigi.Parameter() - grobid_port = luigi.Parameter() + grobid_port = luigi.IntParameter() def program_args(self): diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 1c6229603..842d58575 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -62,3 +62,95 @@ def test_init_parser(): def test_run_arguments(): assert inspect.signature(run.run).parameters.keys() == RUN_PARAMS + + +@pytest.mark.parametrize( + "source,tasks", + [ + ( + "arxiv", + ( + "DownloadTask", + "TopicExtractTask", + "TopicFilterTask", + "CreateSymlinksTask", + "ConvertPDFTask", + "ParseTask", + "AddTask", + + ) + ), + ( + "biorxiv", + ( + "DownloadTask", + "TopicExtractTask", + "TopicFilterTask", + "CreateSymlinksTask", + "ParseTask", + "AddTask", + + ) + ), + ( + "medrxiv", + ( + "DownloadTask", + "TopicExtractTask", + "TopicFilterTask", + "CreateSymlinksTask", + "ParseTask", + "AddTask", + + ) + ), + ( + "pmc", + ( + "DownloadTask", + "UnzipTask", + "TopicExtractTask", + "TopicFilterTask", + "CreateSymlinksTask", + "ParseTask", + "AddTask", + + ) + ), + ( + "pubmed", + ( + "DownloadTask", + "UnzipTask", + "TopicExtractTask", + 
"TopicFilterTask", + "CreateSymlinksTask", + "ParseTask", + "AddTask", + + ) + ), + + + ] +) +def test_pipelines(source, tasks, tmp_path, capsys): + run.run( + source=source, + from_month="whatever", + filter_config="whatever", + output_dir=tmp_path, + dry_run=True, + mesh_topic_db="whatever", + grobid_host="whatever", + grobid_port=1234, + db_url="whatever", + db_type="sqlite", + ) + + captured = capsys.readouterr() + stdout_lines = reversed(captured.out.splitlines()[1:]) + + for stdout_line, task in zip(stdout_lines, tasks): + assert task in stdout_line + From 91458e485ef8366837ca1fd31634a11251804f33 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 16:56:54 +0100 Subject: [PATCH 23/78] Make black happy --- src/bluesearch/entrypoint/database/run.py | 71 ++++++++++++---------- tests/unit/entrypoint/database/test_run.py | 25 +++----- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index ea162a941..22b16bc2c 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -142,20 +142,21 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: return parser -BBS_BINARY = ["gtimeout", "--preserve-status", "5" , "bbs_database"] +BBS_BINARY = ["gtimeout", "--preserve-status", "5", "bbs_database"] BBS_BINARY = ["bbs_database"] CAPTURE_OUTPUT = False + class DownloadTask(ExternalProgramTask): """Download raw files. They will be stored in the `raw/` folder. 
""" + source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() - def output(self) -> luigi.LocalTarget: """Define download folder.""" today = datetime.today() @@ -165,16 +166,19 @@ def output(self) -> luigi.LocalTarget: return luigi.LocalTarget(str(output_dir)) - def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = self.output().path return [ - *BBS_BINARY, "download", "-v", self.source, self.from_month, output_dir, + *BBS_BINARY, + "download", + "-v", + self.source, + self.from_month, + output_dir, ] - @requires(DownloadTask) class UnzipTask(ExternalProgramTask): """Unzip raw files (if necessary). @@ -182,8 +186,8 @@ class UnzipTask(ExternalProgramTask): Only applicable in case of `pubmed` and `pmc`. The unzipped files are stored inside of `raw_unzipped`. """ - source = luigi.Parameter() + source = luigi.Parameter() def output(self) -> luigi.LocalTarget: """Define unzipping folder.""" @@ -194,10 +198,9 @@ def output(self) -> luigi.LocalTarget: def run(self) -> None: """Unzip.""" - input_dir = Path(self.input().path) # raw + input_dir = Path(self.input().path) # raw output_dir = Path(self.output().path) # raw_unzipped - output_dir.mkdir(exist_ok=True, parents=True) if self.source == "pmc": # .tar.gz @@ -223,15 +226,13 @@ def run(self) -> None: for archive in all_zip_files: output_path = output_dir / archive.stem with gzip.open(archive, "rb") as f_in: - with open(output_path,"wb") as f_out: + with open(output_path, "wb") as f_out: shutil.copyfileobj(f_in, f_out) else: raise ValueError(f"Unsupported source {self.source}") - - @inherits(DownloadTask, UnzipTask) class TopicExtractTask(ExternalProgramTask): """Topic extraction. @@ -240,6 +241,7 @@ class TopicExtractTask(ExternalProgramTask): on the source. The output is going to be a single file `topic_infos.jsonl`. 
""" + source = luigi.Parameter() mesh_topic_db = luigi.Parameter() @@ -257,21 +259,25 @@ def output(self) -> luigi.LocalTarget: return luigi.LocalTarget(str(output_file)) - def program_args(self): """Define subprocess arguments.""" input_dir = self.input().path output_dir = self.output().path command = [ - *BBS_BINARY, "topic-extract", "-v", self.source, input_dir, output_dir, + *BBS_BINARY, + "topic-extract", + "-v", + self.source, + input_dir, + output_dir, ] if self.source in {"medrxiv", "biorxiv"}: command.extend( ["-R", "-m", r".*\.meca$"], ) - + if self.source in {"pmc", "pubmed"}: command.append(f"--mesh-topic-db={self.mesh_topic_db}") @@ -292,9 +298,14 @@ def program_args(self): output_file = self.output().path command = [ - *BBS_BINARY, "topic-filter", "-v", extracted_topics, self.filter_config, output_file, + *BBS_BINARY, + "topic-filter", + "-v", + extracted_topics, + self.filter_config, + output_file, ] - + return command @@ -308,7 +319,7 @@ def output(self): def run(self): output_dir = Path(self.output().path) filtering_path = Path(self.input().path) - input_dir = output_dir.parent / "raw_unzipped" + input_dir = output_dir.parent / "raw_unzipped" if (output_dir.parent / "raw_unzipped").exists(): input_dir = output_dir.parent / "raw_unzipped" @@ -328,14 +339,11 @@ def create_symlink(path): accepted.apply(create_symlink) - - @requires(CreateSymlinksTask) class ConvertPDFTask(ExternalProgramTask): grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() - def program_args(self): input_dir = Path(self.input().path).parent / "filtered" output_dir = self.output().path @@ -345,11 +353,11 @@ def program_args(self): "convert-pdf", "-v", self.grobid_host, - self.grobid_port, + self.grobid_port, input_dir, f"--output-dir={output_dir}", ] - + return command def output(self): @@ -375,7 +383,6 @@ def program_args(self): output_dir = Path(self.output().path) output_dir.mkdir(exist_ok=True) - if (output_dir.parent / "converted_pdfs").exists(): input_dir = 
output_dir.parent / "converted_pdfs" else: @@ -396,10 +403,10 @@ def program_args(self): "parse", "-v", parser, - input_dir, + input_dir, output_dir, ] - + return command @@ -423,7 +430,9 @@ def complete(self): if not input_dir.exists(): return False - all_uids = [article.stem for article in input_dir.iterdir() if article.suffix == ".json"] + all_uids = [ + article.stem for article in input_dir.iterdir() if article.suffix == ".json" + ] new_uids = [] for uid in all_uids: @@ -435,11 +444,9 @@ def complete(self): return not new_uids - def program_args(self): input_dir = Path(self.input().path) - command = [ *BBS_BINARY, "add", @@ -448,9 +455,10 @@ def program_args(self): "-v", f"--db-type={self.db_type}", ] - + return command + def run( *, source: str, @@ -471,8 +479,8 @@ def run( """ logger.info("Starting the overall pipeline") - DownloadTask.capture_output = CAPTURE_OUTPUT - TopicExtractTask.capture_output = CAPTURE_OUTPUT + DownloadTask.capture_output = CAPTURE_OUTPUT + TopicExtractTask.capture_output = CAPTURE_OUTPUT final_task = AddTask( source=source, @@ -486,7 +494,6 @@ def run( db_type=db_type, ) - luigi_kwargs = { "tasks": [final_task], "log_level": "DEBUG", diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 842d58575..162eac5c4 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -40,6 +40,7 @@ "grobid_port", } + def test_init_parser(): parser = run.init_parser(argparse.ArgumentParser()) @@ -49,7 +50,7 @@ def test_init_parser(): "--from-month=2021-12", "--filter-config=/path/to/config.jsonl", "--output-dir=some/output/dir", - "--db-url=some.url" + "--db-url=some.url", ] ) assert vars(args).keys() == RUN_PARAMS @@ -65,7 +66,7 @@ def test_run_arguments(): @pytest.mark.parametrize( - "source,tasks", + "source,tasks", [ ( "arxiv", @@ -77,8 +78,7 @@ def test_run_arguments(): "ConvertPDFTask", "ParseTask", "AddTask", - - ) + ), ), ( "biorxiv", @@ -89,8 
+89,7 @@ def test_run_arguments(): "CreateSymlinksTask", "ParseTask", "AddTask", - - ) + ), ), ( "medrxiv", @@ -101,8 +100,7 @@ def test_run_arguments(): "CreateSymlinksTask", "ParseTask", "AddTask", - - ) + ), ), ( "pmc", @@ -114,8 +112,7 @@ def test_run_arguments(): "CreateSymlinksTask", "ParseTask", "AddTask", - - ) + ), ), ( "pubmed", @@ -127,12 +124,9 @@ def test_run_arguments(): "CreateSymlinksTask", "ParseTask", "AddTask", - - ) + ), ), - - - ] + ], ) def test_pipelines(source, tasks, tmp_path, capsys): run.run( @@ -153,4 +147,3 @@ def test_pipelines(source, tasks, tmp_path, capsys): for stdout_line, task in zip(stdout_lines, tasks): assert task in stdout_line - From 7387257ea60e4c4cd538a5aeede88417fbae996e Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 17:02:02 +0100 Subject: [PATCH 24/78] Add pending to the check --- tests/unit/entrypoint/database/test_run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 162eac5c4..05bf85e01 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -147,3 +147,4 @@ def test_pipelines(source, tasks, tmp_path, capsys): for stdout_line, task in zip(stdout_lines, tasks): assert task in stdout_line + assert "PENDING" in stdout_line From 3a49d3e72a94b8ce0f7c90d8a69560f4e6d1e84b Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 14 Feb 2022 17:41:50 +0100 Subject: [PATCH 25/78] Configure output capturing --- src/bluesearch/entrypoint/database/add.py | 1 + src/bluesearch/entrypoint/database/run.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/add.py b/src/bluesearch/entrypoint/database/add.py index 7055f2941..b9f9814b6 100644 --- a/src/bluesearch/entrypoint/database/add.py +++ b/src/bluesearch/entrypoint/database/add.py @@ -124,6 +124,7 @@ def run( sentence_mappings = [] for article in articles: + 
logger.info(f"Processing {article.uid}") article_mapping = { "article_id": article.uid, diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 22b16bc2c..b6bee27c3 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -480,7 +480,13 @@ def run( logger.info("Starting the overall pipeline") DownloadTask.capture_output = CAPTURE_OUTPUT + UnzipTask.capture_output = CAPTURE_OUTPUT TopicExtractTask.capture_output = CAPTURE_OUTPUT + TopicFilterTask.capture_output = CAPTURE_OUTPUT + CreateSymlinksTask.capture_output = CAPTURE_OUTPUT + ConvertPDFTask.capture_output = CAPTURE_OUTPUT + ParseTask.capture_output = CAPTURE_OUTPUT + AddTask.capture_output = CAPTURE_OUTPUT final_task = AddTask( source=source, @@ -497,7 +503,7 @@ def run( luigi_kwargs = { "tasks": [final_task], "log_level": "DEBUG", - "local_scheduler": True, + "local_scheduler": False, } if dry_run: print(print_tree(final_task, last=False)) From 4cc4208f2db88ca6ff47422abde5b55db6031225 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 10:53:11 +0100 Subject: [PATCH 26/78] Add local timeout hack Requires custom_timeout binary to be in the PATH --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index b6bee27c3..a418b9a91 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -142,7 +142,6 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: return parser -BBS_BINARY = ["gtimeout", "--preserve-status", "5", "bbs_database"] BBS_BINARY = ["bbs_database"] CAPTURE_OUTPUT = False @@ -170,6 +169,7 @@ def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = self.output().path return [ + "custom_timeout", *BBS_BINARY, "download", "-v", From 
7b55f2e9115e2ec969306fb5b5f4a37ee2b09a74 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 10:55:47 +0100 Subject: [PATCH 27/78] Only use local-scheduler --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index a418b9a91..f28601aaa 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -503,7 +503,7 @@ def run( luigi_kwargs = { "tasks": [final_task], "log_level": "DEBUG", - "local_scheduler": False, + "local_scheduler": True, } if dry_run: print(print_tree(final_task, last=False)) From 9f23d4c0ec8bdfff07b2193edba5a5811fde19a4 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 11:38:50 +0100 Subject: [PATCH 28/78] Turn entrypoint verbosity into global variable --- src/bluesearch/entrypoint/database/run.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index f28601aaa..cb48a5e64 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -143,6 +143,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: BBS_BINARY = ["bbs_database"] +VERBOSITY = ["-v"] # for the entrypoint subprocesses CAPTURE_OUTPUT = False @@ -172,7 +173,7 @@ def program_args(self) -> list[str]: "custom_timeout", *BBS_BINARY, "download", - "-v", + *VERBOSITY, self.source, self.from_month, output_dir, @@ -267,7 +268,7 @@ def program_args(self): command = [ *BBS_BINARY, "topic-extract", - "-v", + *VERBOSITY, self.source, input_dir, output_dir, @@ -300,7 +301,7 @@ def program_args(self): command = [ *BBS_BINARY, "topic-filter", - "-v", + *VERBOSITY, extracted_topics, self.filter_config, output_file, @@ -351,7 +352,7 @@ def program_args(self): command = [ *BBS_BINARY, "convert-pdf", - "-v", + 
*VERBOSITY, self.grobid_host, self.grobid_port, input_dir, @@ -401,7 +402,7 @@ def program_args(self): command = [ *BBS_BINARY, "parse", - "-v", + *VERBOSITY, parser, input_dir, output_dir, @@ -450,9 +451,9 @@ def program_args(self): command = [ *BBS_BINARY, "add", + *VERBOSITY, self.db_url, input_dir, - "-v", f"--db-type={self.db_type}", ] From 4982824b902447854e64d3631be9de16f663e570 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 12:18:03 +0100 Subject: [PATCH 29/78] Fix source2parse and also postgres complete check --- src/bluesearch/entrypoint/database/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index cb48a5e64..d8b3cbfd6 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -392,8 +392,8 @@ def program_args(self): # Determine parser source2parser = { "arxiv": "tei-xml-arxiv", - "biorxiv": "jats-xml", - "medrxiv": "jats-xml", + "biorxiv": "jats-meca", + "medrxiv": "jats-meca", "pmc": "jats-xml", "pubmed": "pubmed-xml-set", } @@ -437,8 +437,8 @@ def complete(self): new_uids = [] for uid in all_uids: - query = "SELECT article_id from articles WHERE article_id = ?" 
- res = engine.execute(query, (uid,)).fetchall() + query = sqlalchemy.text("SELECT article_id from articles WHERE article_id = :uid") + res = engine.execute(query, uid=uid).fetchall() if not res: new_uids.append(uid) From af47a56f7bf4366e24092b5bf286a35bba6eccf0 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:21:19 +0100 Subject: [PATCH 30/78] Add luigi to requirements --- requirements.txt | 1 + setup.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index effce482e..7cfc95be1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,7 @@ ipython==7.31.1 ipywidgets==7.6.3 jupyterlab==3.0.17 langdetect==1.0.9 +luigi==3.0.3 mashumaro==3.0 numpy==1.21.0 pandas==1.3.0 diff --git a/setup.py b/setup.py index 5a2133dd8..4058d79bd 100644 --- a/setup.py +++ b/setup.py @@ -60,10 +60,11 @@ "ipywidgets", "jupyterlab>=3", "langdetect", - "numpy>=1.20.1", - "pandas>=1", + "luigi", # Serialization framework on top of dataclasses, e.g. 'Article' to and from JSON. 
"mashumaro>=3.0", + "numpy>=1.20.1", + "pandas>=1", "pg8000", "python-dotenv", "requests", From d937dd11595bf40a533b34d5458ddbda84d0fd03 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:29:16 +0100 Subject: [PATCH 31/78] Run black --- src/bluesearch/entrypoint/database/run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index d8b3cbfd6..53ba06acf 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -437,7 +437,9 @@ def complete(self): new_uids = [] for uid in all_uids: - query = sqlalchemy.text("SELECT article_id from articles WHERE article_id = :uid") + query = sqlalchemy.text( + "SELECT article_id from articles WHERE article_id = :uid" + ) res = engine.execute(query, uid=uid).fetchall() if not res: From 862358819e76a7435574d68ff43863448ad6d6bb Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:36:39 +0100 Subject: [PATCH 32/78] Correct flake8 mistakes --- src/bluesearch/entrypoint/database/run.py | 9 --------- tests/unit/entrypoint/database/test_run.py | 4 ---- 2 files changed, 13 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 53ba06acf..31abbf905 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -19,14 +19,11 @@ import argparse import gzip -import json import logging import shutil import tarfile -import warnings from datetime import datetime from pathlib import Path -from typing import Iterator import luigi import pandas as pd @@ -320,12 +317,6 @@ def output(self): def run(self): output_dir = Path(self.output().path) filtering_path = Path(self.input().path) - input_dir = output_dir.parent / "raw_unzipped" - - if (output_dir.parent / "raw_unzipped").exists(): - input_dir = output_dir.parent / "raw_unzipped" - else: - input_dir = 
output_dir.parent / "raw" filtering = pd.read_csv(filtering_path) accepted = pd.Series(filtering[filtering.accept].path.unique()) diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 05bf85e01..5694ba232 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -16,16 +16,12 @@ # along with this program. If not, see . import argparse -import datetime import inspect import pathlib -import numpy as np -import pandas as pd import pytest from bluesearch.entrypoint.database import run -from bluesearch.utils import JSONL RUN_PARAMS = { "source", From db5768dcd700a3f2f4b1730c6c37b16a7226cf86 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:38:57 +0100 Subject: [PATCH 33/78] Fix isort problems --- src/bluesearch/entrypoint/database/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 31abbf905..d0ea16e7f 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -25,14 +25,14 @@ from datetime import datetime from pathlib import Path -import luigi import pandas as pd import sqlalchemy -from luigi.util import inherits, requires -from luigi.contrib.external_program import ExternalProgramTask -from luigi.tools.deps_tree import print_tree +import luigi from bluesearch.database.article import ArticleSource +from luigi.contrib.external_program import ExternalProgramTask +from luigi.tools.deps_tree import print_tree +from luigi.util import inherits, requires logger = logging.getLogger(__name__) From 9b19dc605ed58ceb5b4bd29c566c1becf915f08f Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 14:48:26 +0100 Subject: [PATCH 34/78] Fix typing --- src/bluesearch/entrypoint/database/run.py | 8 ++++---- tests/unit/entrypoint/database/test_run.py | 4 ++-- 2 files changed, 6 insertions(+), 6 
deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index d0ea16e7f..7de7a6c34 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -212,7 +212,7 @@ def run(self) -> None: output_path = output_dir / article.path.rpartition("/")[2] f_in = my_tar.extractfile(article) with open(output_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) + shutil.copyfileobj(f_in, f_out) # type: ignore my_tar.close() elif self.source == "pubmed": @@ -223,9 +223,9 @@ def run(self) -> None: for archive in all_zip_files: output_path = output_dir / archive.stem - with gzip.open(archive, "rb") as f_in: - with open(output_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) + with gzip.open(archive, "rb") as f_in_2: + with open(output_path, "wb") as f_out_2: + shutil.copyfileobj(f_in_2, f_out_2) else: raise ValueError(f"Unsupported source {self.source}") diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 5694ba232..402563d6e 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -128,10 +128,10 @@ def test_pipelines(source, tasks, tmp_path, capsys): run.run( source=source, from_month="whatever", - filter_config="whatever", + filter_config=pathlib.Path("whatever"), output_dir=tmp_path, dry_run=True, - mesh_topic_db="whatever", + mesh_topic_db=pathlib.Path("whatever"), grobid_host="whatever", grobid_port=1234, db_url="whatever", From 03b433badb81e55bcf0fae3d78325bfec4cc7f30 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 15:02:23 +0100 Subject: [PATCH 35/78] Add more docstrings --- src/bluesearch/entrypoint/database/run.py | 40 +++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 7de7a6c34..48b1e9d26 100644 --- 
a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -284,14 +284,22 @@ def program_args(self): @requires(TopicExtractTask) class TopicFilterTask(ExternalProgramTask): + """Run topic filtering entrypoint. + + It inputs `topic_infos.jsonl` and `filter_config` (rules) and it + generates a file `filtering.csv`. + """ + filter_config = luigi.Parameter() def output(self): + """Define output file.""" output_file = Path(self.input().path).parent / "filtering.csv" return luigi.LocalTarget(str(output_file)) def program_args(self): + """Define subprocess arguments.""" extracted_topics = self.input().path output_file = self.output().path @@ -309,16 +317,22 @@ def program_args(self): @requires(TopicFilterTask) class CreateSymlinksTask(luigi.Task): + """Create folder of symlinked articles. + + We only symlink those articles that made it through the topic-filtering + stage. The only input is the `filtering.csv`. + """ def output(self): + """Define output folder.""" output_dir = Path(self.input().path).parent / "filtered" return luigi.LocalTarget(str(output_dir)) def run(self): + """Create symlinks.""" output_dir = Path(self.output().path) - filtering_path = Path(self.input().path) - filtering = pd.read_csv(filtering_path) + filtering = pd.read_csv(self.input()) accepted = pd.Series(filtering[filtering.accept].path.unique()) def create_symlink(path): @@ -333,10 +347,16 @@ def create_symlink(path): @requires(CreateSymlinksTask) class ConvertPDFTask(ExternalProgramTask): + """Convert PDFs to XMLs. + + Assumes that there is a GROBID server up and running. Only necessary + when `source=arxiv`. The output is the folder `converted_pdfs/`. 
+ """ grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() def program_args(self): + """Define subprocess arguments.""" input_dir = Path(self.input().path).parent / "filtered" output_dir = self.output().path @@ -353,6 +373,7 @@ def program_args(self): return command def output(self): + """Define output folder.""" output_file = Path(self.input().path).parent / "converted_pdfs" return luigi.LocalTarget(str(output_file)) @@ -360,18 +381,26 @@ def output(self): @inherits(ConvertPDFTask, CreateSymlinksTask) class ParseTask(ExternalProgramTask): + """Parse articles. + + The input is all the articles inside of `filtered/` (or in case of + `source="arxiv"` `converted_pdfs/`. + """ def requires(self): + """Define conditional dependencies.""" if self.source == "arxiv": return self.clone(ConvertPDFTask) else: return self.clone(CreateSymlinksTask) def output(self): + """Define output folder.""" output_file = Path(self.input().path).parent / "parsed" return luigi.LocalTarget(str(output_file)) def program_args(self): + """Define subprocess arguments.""" output_dir = Path(self.output().path) output_dir.mkdir(exist_ok=True) @@ -404,10 +433,16 @@ def program_args(self): @requires(ParseTask) class AddTask(ExternalProgramTask): + """Add parsed articles to the database. + + This step is considered done if all articles inside of `parsed/` are + already in the database. 
+ """ db_url = luigi.Parameter() db_type = luigi.Parameter() def complete(self): + """Check if all articles inside of `parsed/` are in the database.""" # If all the articles are inside if self.db_type == "sqlite": prefix = "sqlite:///" @@ -439,6 +474,7 @@ def complete(self): return not new_uids def program_args(self): + """Define subprocess arguments.""" input_dir = Path(self.input().path) command = [ From 91c0c71e1006df204d6597d73a3d7d0c627818a2 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 15:02:55 +0100 Subject: [PATCH 36/78] Rerun formatting --- src/bluesearch/entrypoint/database/run.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 48b1e9d26..2542600a3 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -322,6 +322,7 @@ class CreateSymlinksTask(luigi.Task): We only symlink those articles that made it through the topic-filtering stage. The only input is the `filtering.csv`. """ + def output(self): """Define output folder.""" output_dir = Path(self.input().path).parent / "filtered" @@ -352,6 +353,7 @@ class ConvertPDFTask(ExternalProgramTask): Assumes that there is a GROBID server up and running. Only necessary when `source=arxiv`. The output is the folder `converted_pdfs/`. """ + grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() @@ -386,6 +388,7 @@ class ParseTask(ExternalProgramTask): The input is all the articles inside of `filtered/` (or in case of `source="arxiv"` `converted_pdfs/`. """ + def requires(self): """Define conditional dependencies.""" if self.source == "arxiv": @@ -438,6 +441,7 @@ class AddTask(ExternalProgramTask): This step is considered done if all articles inside of `parsed/` are already in the database. 
""" + db_url = luigi.Parameter() db_type = luigi.Parameter() From b4d3d7bd93de1177c2ecb889abc14e43f6ab2b59 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 15:28:32 +0100 Subject: [PATCH 37/78] Nasty global variable date handling --- src/bluesearch/entrypoint/database/run.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 2542600a3..dafd52daa 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -142,6 +142,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: BBS_BINARY = ["bbs_database"] VERBOSITY = ["-v"] # for the entrypoint subprocesses CAPTURE_OUTPUT = False +OUTPUT_DIR_RAW = None # make sure the same datestamp for all tasks class DownloadTask(ExternalProgramTask): @@ -156,12 +157,14 @@ class DownloadTask(ExternalProgramTask): def output(self) -> luigi.LocalTarget: """Define download folder.""" - today = datetime.today() - date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + global OUTPUT_DIR_RAW + if OUTPUT_DIR_RAW is None: + today = datetime.today() + date = f"{self.from_month}_{today.strftime('%Y-%m-%d:%M-%S')}" - output_dir = Path(self.output_dir) / self.source / date / "raw" + OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / date / "raw" - return luigi.LocalTarget(str(output_dir)) + return luigi.LocalTarget(str(OUTPUT_DIR_RAW)) def program_args(self) -> list[str]: """Define subprocess arguments.""" @@ -333,7 +336,7 @@ def run(self): """Create symlinks.""" output_dir = Path(self.output().path) - filtering = pd.read_csv(self.input()) + filtering = pd.read_csv(self.input().path) accepted = pd.Series(filtering[filtering.accept].path.unique()) def create_symlink(path): From fd3f24dd6b3032fd28a370cabb06fb918a81c5a0 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 16:44:09 +0100 Subject: [PATCH 38/78] Dont consider 
minutes and seconds --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index dafd52daa..d8e47861c 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -160,7 +160,7 @@ def output(self) -> luigi.LocalTarget: global OUTPUT_DIR_RAW if OUTPUT_DIR_RAW is None: today = datetime.today() - date = f"{self.from_month}_{today.strftime('%Y-%m-%d:%M-%S')}" + date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / date / "raw" From 7b5646957805cf03b315bbe1ae7c873bd37781e8 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 17:16:15 +0100 Subject: [PATCH 39/78] Rename task to be more versatile --- src/bluesearch/entrypoint/database/run.py | 14 +++++++------- tests/unit/entrypoint/database/test_run.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index d8e47861c..b5f45f70a 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -319,10 +319,10 @@ def program_args(self): @requires(TopicFilterTask) -class CreateSymlinksTask(luigi.Task): - """Create folder of symlinked articles. +class PerformFilteringTask(luigi.Task): + """Create folder that only contains relevant articles. - We only symlink those articles that made it through the topic-filtering + We only consider those articles that made it through the topic-filtering stage. The only input is the `filtering.csv`. """ @@ -349,7 +349,7 @@ def create_symlink(path): accepted.apply(create_symlink) -@requires(CreateSymlinksTask) +@requires(PerformFilteringTask) class ConvertPDFTask(ExternalProgramTask): """Convert PDFs to XMLs. 
@@ -384,7 +384,7 @@ def output(self): return luigi.LocalTarget(str(output_file)) -@inherits(ConvertPDFTask, CreateSymlinksTask) +@inherits(ConvertPDFTask, PerformFilteringTask) class ParseTask(ExternalProgramTask): """Parse articles. @@ -397,7 +397,7 @@ def requires(self): if self.source == "arxiv": return self.clone(ConvertPDFTask) else: - return self.clone(CreateSymlinksTask) + return self.clone(PerformFilteringTask) def output(self): """Define output folder.""" @@ -520,7 +520,7 @@ def run( UnzipTask.capture_output = CAPTURE_OUTPUT TopicExtractTask.capture_output = CAPTURE_OUTPUT TopicFilterTask.capture_output = CAPTURE_OUTPUT - CreateSymlinksTask.capture_output = CAPTURE_OUTPUT + PerformFilteringTask.capture_output = CAPTURE_OUTPUT ConvertPDFTask.capture_output = CAPTURE_OUTPUT ParseTask.capture_output = CAPTURE_OUTPUT AddTask.capture_output = CAPTURE_OUTPUT diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 402563d6e..be66fac66 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -70,7 +70,7 @@ def test_run_arguments(): "DownloadTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ConvertPDFTask", "ParseTask", "AddTask", @@ -82,7 +82,7 @@ def test_run_arguments(): "DownloadTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ParseTask", "AddTask", ), @@ -93,7 +93,7 @@ def test_run_arguments(): "DownloadTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ParseTask", "AddTask", ), @@ -105,7 +105,7 @@ def test_run_arguments(): "UnzipTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ParseTask", "AddTask", ), @@ -117,7 +117,7 @@ def test_run_arguments(): "UnzipTask", "TopicExtractTask", "TopicFilterTask", - "CreateSymlinksTask", + "PerformFilteringTask", "ParseTask", "AddTask", ), 
From fb082c1dc5f856da456e79674c9877e7c2760357 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 17:29:01 +0100 Subject: [PATCH 40/78] Write pseudocode for pubmed performfilter --- src/bluesearch/entrypoint/database/run.py | 27 +++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index b5f45f70a..01e541756 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -337,16 +337,29 @@ def run(self): output_dir = Path(self.output().path) filtering = pd.read_csv(self.input().path) - accepted = pd.Series(filtering[filtering.accept].path.unique()) - - def create_symlink(path): - input_path = Path(path) - output_path = output_dir / input_path.name - output_path.symlink_to(input_path) output_dir.mkdir(exist_ok=True) - accepted.apply(create_symlink) + if self.source == "pubmed": + # Find all input files (.xml.gz) + + # Iteratively Load each of the files in memory + # Create a copy of the XML + # Remove elements that were not accepted from the copy + # Store the copy with removed elements + + # Iteratively zip and save all of the "pruned" copies + pass + + else: + accepted = pd.Series(filtering[filtering.accept].path.unique()) + def create_symlink(path): + input_path = Path(path) + output_path = output_dir / input_path.name + output_path.symlink_to(input_path) + + + accepted.apply(create_symlink) From 1179531db0e931521634351df7092269c09979cf Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 17:33:41 +0100 Subject: [PATCH 41/78] Don't run unzipping for pubmed --- src/bluesearch/entrypoint/database/run.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 01e541756..3a1267beb 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ 
b/src/bluesearch/entrypoint/database/run.py @@ -218,18 +218,6 @@ def run(self) -> None: shutil.copyfileobj(f_in, f_out) # type: ignore my_tar.close() - elif self.source == "pubmed": - # .xml.gz - all_zip_files = [p for p in input_dir.iterdir() if p.suffix == ".gz"] - if not all_zip_files: - raise ValueError("No zip files were found") - - for archive in all_zip_files: - output_path = output_dir / archive.stem - with gzip.open(archive, "rb") as f_in_2: - with open(output_path, "wb") as f_out_2: - shutil.copyfileobj(f_in_2, f_out_2) - else: raise ValueError(f"Unsupported source {self.source}") @@ -248,7 +236,7 @@ class TopicExtractTask(ExternalProgramTask): def requires(self) -> luigi.Task: """Define conditional dependencies.""" - if self.source in {"pmc", "pubmed"}: + if self.source in {"pmc"}: return self.clone(UnzipTask) else: return self.clone(DownloadTask) From 932478976b5a14339626e6f90b6c0ae6921f0d1d Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 15 Feb 2022 17:57:41 +0100 Subject: [PATCH 42/78] WIP-performfiltering task --- src/bluesearch/entrypoint/database/run.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 3a1267beb..1684543e7 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -18,6 +18,7 @@ from __future__ import annotations import argparse +import copy import gzip import logging import shutil @@ -27,6 +28,7 @@ import pandas as pd import sqlalchemy +from defusedxml import ElementTree import luigi from bluesearch.database.article import ArticleSource @@ -322,6 +324,7 @@ def output(self): def run(self): """Create symlinks.""" + output_dir = Path(self.output().path) filtering = pd.read_csv(self.input().path) @@ -330,10 +333,26 @@ def run(self): if self.source == "pubmed": # Find all input files (.xml.gz) + all_input_files = [Path(p) for p in filtering["path"].unique()] 
# Iteratively Load each of the files in memory + for input_file in all_input_files: + # Unzip it + with gzip.open(input_file) as xml_stream: + article_set = ElementTree.parse(xml_stream) + + # Create a copy of the XML - # Remove elements that were not accepted from the copy + article_set_copy = copy.deepcopy(article_set) + + # Find elements that were not accepted + to_remove = filtering[(filtering["path"] == str(input_file)) & (~filtering["accept"])] + + for eif in to_remove["element_in_file"].tolist(): + # Remove the corresponding from the copy + + + # Store the copy with removed elements # Iteratively zip and save all of the "pruned" copies From 69187d010dccdc960dd748f78c00d1081d3c3109 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 16:30:25 +0100 Subject: [PATCH 43/78] Implement subtree removal logic --- src/bluesearch/entrypoint/database/run.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 1684543e7..639144925 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -337,26 +337,29 @@ def run(self): # Iteratively Load each of the files in memory for input_file in all_input_files: - # Unzip it + # Unzip it with gzip.open(input_file) as xml_stream: article_set = ElementTree.parse(xml_stream) # Create a copy of the XML - article_set_copy = copy.deepcopy(article_set) + # article_set_copy = copy.deepcopy(article_set) + root = article_set.getroot() # Find elements that were not accepted to_remove = filtering[(filtering["path"] == str(input_file)) & (~filtering["accept"])] - - for eif in to_remove["element_in_file"].tolist(): - # Remove the corresponding from the copy + article_nodes = root.findall("PubmedArticle") + for eif in to_remove["element_in_file"].astype(int).tolist(): + # Remove the corresponding from the copy + root.remove(article_nodes[eif]) # Store the copy with 
removed elements + output_file = output_dir / input_file.stem + article_set.write(output_file) + # Zipping TODO - # Iteratively zip and save all of the "pruned" copies - pass else: accepted = pd.Series(filtering[filtering.accept].path.unique()) From 55df185bd947b9886fd030a835c2c3b5eff63a4d Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 16:44:28 +0100 Subject: [PATCH 44/78] Make luigi less verbose --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 639144925..c60899359 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -562,7 +562,7 @@ def run( luigi_kwargs = { "tasks": [final_task], - "log_level": "DEBUG", + "log_level": "WARNING", "local_scheduler": True, } if dry_run: From f629ae1af7ce14015d543b3d554838b0d4620447 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:01:16 +0100 Subject: [PATCH 45/78] Fix the immortal bug --- src/bluesearch/entrypoint/database/topic_extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index bce22a350..7a43e21d6 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -188,7 +188,7 @@ def run( mesh_tree = mesh.MeSHTree.load(mesh_topic_db) for path in inputs: logger.info(f"Processing {path}") - with gzip.open(input_path) as xml_stream: + with gzip.open(path) as xml_stream: articles = ElementTree.parse(xml_stream) for i, article in enumerate(articles.iter("PubmedArticle")): From 9702aa3ac7c8167d8e258761431e2c468cccb1a5 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:28:05 +0100 Subject: [PATCH 46/78] Make sure PerformFilteringTask zips pubmed-article-set --- 
src/bluesearch/entrypoint/database/run.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index c60899359..eb32dbd16 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -29,6 +29,7 @@ import pandas as pd import sqlalchemy from defusedxml import ElementTree +from defusedxml.cElementTree import tostring import luigi from bluesearch.database.article import ArticleSource @@ -356,9 +357,10 @@ def run(self): root.remove(article_nodes[eif]) # Store the copy with removed elements - output_file = output_dir / input_file.stem - article_set.write(output_file) - # Zipping TODO + output_file = output_dir / input_file.name + out_bytes = tostring(root) + with gzip.open(output_file, 'wb') as f: + f.write(out_bytes) else: From 857541ddb25a19259b0bb8d45abca017d9463ce5 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:29:31 +0100 Subject: [PATCH 47/78] Run formatting --- src/bluesearch/entrypoint/database/run.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index eb32dbd16..637478d8d 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -342,16 +342,16 @@ def run(self): with gzip.open(input_file) as xml_stream: article_set = ElementTree.parse(xml_stream) - # Create a copy of the XML # article_set_copy = copy.deepcopy(article_set) root = article_set.getroot() # Find elements that were not accepted - to_remove = filtering[(filtering["path"] == str(input_file)) & (~filtering["accept"])] + to_remove = filtering[ + (filtering["path"] == str(input_file)) & (~filtering["accept"]) + ] article_nodes = root.findall("PubmedArticle") - for eif in to_remove["element_in_file"].astype(int).tolist(): # Remove the corresponding from the copy 
root.remove(article_nodes[eif]) @@ -359,18 +359,17 @@ def run(self): # Store the copy with removed elements output_file = output_dir / input_file.name out_bytes = tostring(root) - with gzip.open(output_file, 'wb') as f: - f.write(out_bytes) - + with gzip.open(output_file, "wb") as f: + f.write(out_bytes) else: accepted = pd.Series(filtering[filtering.accept].path.unique()) + def create_symlink(path): input_path = Path(path) output_path = output_dir / input_path.name output_path.symlink_to(input_path) - accepted.apply(create_symlink) From 81cad764b0549f06faa8e95ae7edd92d3aa4eabe Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:32:13 +0100 Subject: [PATCH 48/78] Make sure unit tests are passing --- src/bluesearch/entrypoint/database/run.py | 2 +- tests/unit/entrypoint/database/test_run.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 637478d8d..fc7ec5674 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -29,7 +29,7 @@ import pandas as pd import sqlalchemy from defusedxml import ElementTree -from defusedxml.cElementTree import tostring +from defusedxml.ElementTree import tostring import luigi from bluesearch.database.article import ArticleSource diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index be66fac66..acd64bf5e 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -114,7 +114,6 @@ def test_run_arguments(): "pubmed", ( "DownloadTask", - "UnzipTask", "TopicExtractTask", "TopicFilterTask", "PerformFilteringTask", From 1ad559c1db5b69e6076446621334e5171890fe16 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:40:27 +0100 Subject: [PATCH 49/78] Update sphinx --- docs/source/api/bluesearch.entrypoint.database.rst | 1 + 
docs/source/api/bluesearch.entrypoint.database.run.rst | 7 +++++++ 2 files changed, 8 insertions(+) create mode 100644 docs/source/api/bluesearch.entrypoint.database.run.rst diff --git a/docs/source/api/bluesearch.entrypoint.database.rst b/docs/source/api/bluesearch.entrypoint.database.rst index 9655b8576..9f3c0c5fe 100644 --- a/docs/source/api/bluesearch.entrypoint.database.rst +++ b/docs/source/api/bluesearch.entrypoint.database.rst @@ -14,6 +14,7 @@ Submodules bluesearch.entrypoint.database.parent bluesearch.entrypoint.database.parse bluesearch.entrypoint.database.parse_mesh_rdf + bluesearch.entrypoint.database.run bluesearch.entrypoint.database.schemas bluesearch.entrypoint.database.topic_extract bluesearch.entrypoint.database.topic_filter diff --git a/docs/source/api/bluesearch.entrypoint.database.run.rst b/docs/source/api/bluesearch.entrypoint.database.run.rst new file mode 100644 index 000000000..3239ab645 --- /dev/null +++ b/docs/source/api/bluesearch.entrypoint.database.run.rst @@ -0,0 +1,7 @@ +bluesearch.entrypoint.database.run module +========================================= + +.. 
automodule:: bluesearch.entrypoint.database.run + :members: + :undoc-members: + :show-inheritance: From 36b08f4c72c443bcb264a96cd2399bac2fb4f161 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:47:26 +0100 Subject: [PATCH 50/78] Fix linting --- src/bluesearch/entrypoint/database/run.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index fc7ec5674..153fbd66f 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -18,7 +18,6 @@ from __future__ import annotations import argparse -import copy import gzip import logging import shutil @@ -325,7 +324,6 @@ def output(self): def run(self): """Create symlinks.""" - output_dir = Path(self.output().path) filtering = pd.read_csv(self.input().path) From 6efc98476a39a898a9d7ba0334302ff5f73f322d Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:49:30 +0100 Subject: [PATCH 51/78] Update docstring --- src/bluesearch/entrypoint/database/run.py | 2 +- tox.ini | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 153fbd66f..3562fe76e 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -186,7 +186,7 @@ def program_args(self) -> list[str]: class UnzipTask(ExternalProgramTask): """Unzip raw files (if necessary). - Only applicable in case of `pubmed` and `pmc`. The unzipped files + Only applicable in case of `pmc`. The unzipped files are stored inside of `raw_unzipped`. 
""" diff --git a/tox.ini b/tox.ini index 791f75f4a..d0a333ebe 100644 --- a/tox.ini +++ b/tox.ini @@ -35,7 +35,7 @@ commands = pytest -m "" {posargs:tests} [testenv:lint] description = Lint using flake8, black, isort and bandit -basepython = python3.7 +basepython = python3.8 skip_install = true deps = bandit==1.7.0 @@ -69,7 +69,7 @@ commands = [testenv:format] description = Apply black and isort -basepython = python3.7 +basepython = python3.8 skip_install = true deps = black==21.5b1 From 164d0338e3c97bac411539e44ac3c5bc8ae8cd31 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 17 Feb 2022 17:55:48 +0100 Subject: [PATCH 52/78] Undo changes in tox.ini --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index d0a333ebe..791f75f4a 100644 --- a/tox.ini +++ b/tox.ini @@ -35,7 +35,7 @@ commands = pytest -m "" {posargs:tests} [testenv:lint] description = Lint using flake8, black, isort and bandit -basepython = python3.8 +basepython = python3.7 skip_install = true deps = bandit==1.7.0 @@ -69,7 +69,7 @@ commands = [testenv:format] description = Apply black and isort -basepython = python3.8 +basepython = python3.7 skip_install = true deps = black==21.5b1 From 8f1b8965ef694341c952ee6b746a5d539f5e5a26 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 09:51:20 +0100 Subject: [PATCH 53/78] Add luigi config Otherwise imoprts raise deprecation warnings --- luigi.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 luigi.cfg diff --git a/luigi.cfg b/luigi.cfg new file mode 100644 index 000000000..b2c955f2b --- /dev/null +++ b/luigi.cfg @@ -0,0 +1,2 @@ +[core] + autoload_range=true From 83006c97d0d146d183260fa83e77fa4fe59854c4 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 10:18:11 +0100 Subject: [PATCH 54/78] Fix isort --- src/bluesearch/entrypoint/database/run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py 
b/src/bluesearch/entrypoint/database/run.py index 3562fe76e..5c87f23e0 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -25,17 +25,17 @@ from datetime import datetime from pathlib import Path +import luigi import pandas as pd import sqlalchemy from defusedxml import ElementTree from defusedxml.ElementTree import tostring - -import luigi -from bluesearch.database.article import ArticleSource from luigi.contrib.external_program import ExternalProgramTask from luigi.tools.deps_tree import print_tree from luigi.util import inherits, requires +from bluesearch.database.article import ArticleSource + logger = logging.getLogger(__name__) From 08f7c7796a75e0b082fe3f5478dbfd0f7fd1add7 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 11:18:31 +0100 Subject: [PATCH 55/78] Try to fix sphinx warning --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index f8649cf5c..d37550bcc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,6 +31,7 @@ version = bluesearch.__version__ # -- General configuration --------------------------------------------------- +suppress_warnings = ["ref.ref"] # because of luigi.util.requires # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom From 45fadf2aa862d49aef096867eb994b4127a92d2c Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 11:19:19 +0100 Subject: [PATCH 56/78] Remove custom_timeout from the source code However, still very useful locally --- src/bluesearch/entrypoint/database/run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 5c87f23e0..9e6efebee 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -172,7 +172,6 @@ def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = self.output().path return [ - "custom_timeout", *BBS_BINARY, "download", *VERBOSITY, From 09b338e9c0cff119f86c8280caa3324f7241d6ea Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Fri, 18 Feb 2022 12:27:02 +0100 Subject: [PATCH 57/78] Add type annotations everywhere --- src/bluesearch/entrypoint/database/run.py | 28 +++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 9e6efebee..88857bfca 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -249,7 +249,7 @@ def output(self) -> luigi.LocalTarget: return luigi.LocalTarget(str(output_file)) - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" input_dir = self.input().path output_dir = self.output().path @@ -284,13 +284,13 @@ class TopicFilterTask(ExternalProgramTask): filter_config = luigi.Parameter() - def output(self): + def output(self) -> luigi.LocalTarget: """Define output file.""" output_file = Path(self.input().path).parent / "filtering.csv" return luigi.LocalTarget(str(output_file)) - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" 
extracted_topics = self.input().path output_file = self.output().path @@ -315,13 +315,13 @@ class PerformFilteringTask(luigi.Task): stage. The only input is the `filtering.csv`. """ - def output(self): + def output(self) -> luigi.LocalTarget: """Define output folder.""" output_dir = Path(self.input().path).parent / "filtered" return luigi.LocalTarget(str(output_dir)) - def run(self): + def run(self) -> None: """Create symlinks.""" output_dir = Path(self.output().path) @@ -381,7 +381,7 @@ class ConvertPDFTask(ExternalProgramTask): grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" input_dir = Path(self.input().path).parent / "filtered" output_dir = self.output().path @@ -398,7 +398,7 @@ def program_args(self): return command - def output(self): + def output(self) -> luigi.LocalTarget: """Define output folder.""" output_file = Path(self.input().path).parent / "converted_pdfs" @@ -413,20 +413,20 @@ class ParseTask(ExternalProgramTask): `source="arxiv"` `converted_pdfs/`. 
""" - def requires(self): + def requires(self) -> luigi.Task: """Define conditional dependencies.""" if self.source == "arxiv": return self.clone(ConvertPDFTask) else: return self.clone(PerformFilteringTask) - def output(self): + def output(self) -> luigi.LocalTarget: """Define output folder.""" output_file = Path(self.input().path).parent / "parsed" return luigi.LocalTarget(str(output_file)) - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = Path(self.output().path) output_dir.mkdir(exist_ok=True) @@ -451,8 +451,8 @@ def program_args(self): "parse", *VERBOSITY, parser, - input_dir, - output_dir, + str(input_dir), + str(output_dir), ] return command @@ -469,7 +469,7 @@ class AddTask(ExternalProgramTask): db_url = luigi.Parameter() db_type = luigi.Parameter() - def complete(self): + def complete(self) -> bool: """Check if all articles inside of `parsed/` are in the database.""" # If all the articles are inside if self.db_type == "sqlite": @@ -501,7 +501,7 @@ def complete(self): return not new_uids - def program_args(self): + def program_args(self) -> list[str]: """Define subprocess arguments.""" input_dir = Path(self.input().path) From eb8bce61e7fae510dbab8550ee94f30eb6d42801 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 13:59:06 +0100 Subject: [PATCH 58/78] Add custom identifier logic --- src/bluesearch/entrypoint/database/run.py | 16 ++++++++++++++-- tests/unit/entrypoint/database/test_run.py | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 88857bfca..1451ae077 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -137,6 +137,11 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: type=int, help="The port of the GROBID server.", ) + parser.add_argument( + "--identifier", + type=str, + help="Custom 
name of the identifier. If not specified, we use `from-month_today`", + ) return parser @@ -156,15 +161,20 @@ class DownloadTask(ExternalProgramTask): source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() + identifier = luigi.OptionalParameter() def output(self) -> luigi.LocalTarget: """Define download folder.""" global OUTPUT_DIR_RAW if OUTPUT_DIR_RAW is None: today = datetime.today() - date = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + if self.identifier is None: + identifier = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + else: + identifier = self.identifier + - OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / date / "raw" + OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / identifier / "raw" return luigi.LocalTarget(str(OUTPUT_DIR_RAW)) @@ -529,6 +539,7 @@ def run( dry_run: bool, grobid_host: str | None, grobid_port: int | None, + identifier: str | None, ) -> int: """Run overall pipeline. @@ -556,6 +567,7 @@ def run( grobid_port=grobid_port, db_url=db_url, db_type=db_type, + identifier=identifier, ) luigi_kwargs = { diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index acd64bf5e..bb49fb289 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -34,6 +34,7 @@ "dry_run", "grobid_host", "grobid_port", + "identifier", } @@ -135,6 +136,7 @@ def test_pipelines(source, tasks, tmp_path, capsys): grobid_port=1234, db_url="whatever", db_type="sqlite", + identifier=None, ) captured = capsys.readouterr() From 2e9e576d29f9ae2057153a9ec1eb72360b57aec6 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 13:59:49 +0100 Subject: [PATCH 59/78] Reformat --- src/bluesearch/entrypoint/database/run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 1451ae077..53e9211df 100644 --- 
a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -173,7 +173,6 @@ def output(self) -> luigi.LocalTarget: else: identifier = self.identifier - OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / identifier / "raw" return luigi.LocalTarget(str(OUTPUT_DIR_RAW)) From 3c17bb3773d1aca4931a4de80fe66c1ce3d748f0 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 14:07:54 +0100 Subject: [PATCH 60/78] Break the line --- src/bluesearch/entrypoint/database/run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 53e9211df..c6352e179 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -140,7 +140,9 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.add_argument( "--identifier", type=str, - help="Custom name of the identifier. If not specified, we use `from-month_today`", + help="""Custom name of the identifier. If not specified, we use + `from-month_today` + """, ) return parser From 7c50ca45e266072da5861afc31cb29000d9aa19c Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 14:10:40 +0100 Subject: [PATCH 61/78] Fix typos --- src/bluesearch/entrypoint/database/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index c6352e179..a2866e601 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -238,7 +238,7 @@ def run(self) -> None: class TopicExtractTask(ExternalProgramTask): """Topic extraction. - The input of this dask is either `raw/` or `raw_unzipped/` depending + The input of this task is either `raw/` or `raw_unzipped/` depending on the source. The output is going to be a single file `topic_infos.jsonl`. 
""" @@ -344,7 +344,7 @@ def run(self) -> None: # Find all input files (.xml.gz) all_input_files = [Path(p) for p in filtering["path"].unique()] - # Iteratively Load each of the files in memory + # Iteratively load each of the files in memory for input_file in all_input_files: # Unzip it with gzip.open(input_file) as xml_stream: From 927fc89c844744038517f8b0928c5ec16c7f8149 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Mon, 21 Feb 2022 14:14:02 +0100 Subject: [PATCH 62/78] Add forgotten bracket --- src/bluesearch/entrypoint/database/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index a2866e601..89ce85082 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -421,7 +421,7 @@ class ParseTask(ExternalProgramTask): """Parse articles. The input is all the articles inside of `filtered/` (or in case of - `source="arxiv"` `converted_pdfs/`. + `source="arxiv"` `converted_pdfs/`). 
""" def requires(self) -> luigi.Task: From 25e5fed4d9c59377d82ba8f58c5fa4ddf2e3b797 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 22 Feb 2022 13:51:58 +0100 Subject: [PATCH 63/78] Add recursive enumeration to pubmed --- src/bluesearch/entrypoint/database/run.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 89ce85082..020b7b239 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -282,6 +282,12 @@ def program_args(self) -> list[str]: if self.source in {"pmc", "pubmed"}: command.append(f"--mesh-topic-db={self.mesh_topic_db}") + if self.source == "pubmed": + command.extend( + ["-R", "-m", r".*\.xml\.gz$"], + ) + + return command From 0d5f772d1794f3b8b8aa3d214fd9f7f7b1ee3933 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 22 Feb 2022 13:57:30 +0100 Subject: [PATCH 64/78] Add logging for each element in file --- src/bluesearch/entrypoint/database/topic_extract.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 7a43e21d6..3a116939c 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -192,6 +192,7 @@ def run( articles = ElementTree.parse(xml_stream) for i, article in enumerate(articles.iter("PubmedArticle")): + logger.info(f"Processing element in file {i}") topic_info = TopicInfo( source=article_source, path=path.resolve(), From cdc0d873145bc3da9297cac6ea6dd07d7fa61739 Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Mon, 21 Feb 2022 14:14:28 +0100 Subject: [PATCH 65/78] Skip download for arXiv articles with broken ID or version (#586) --- src/bluesearch/database/download.py | 22 ++++++++++++---------- tests/unit/database/test_download.py | 24 +++++++++++++++++------- 2 files changed, 29 insertions(+), 17 deletions(-) 
diff --git a/src/bluesearch/database/download.py b/src/bluesearch/database/download.py index 06493351c..b3be9d07e 100644 --- a/src/bluesearch/database/download.py +++ b/src/bluesearch/database/download.py @@ -246,21 +246,23 @@ def get_gcs_urls( client = bucket.client + def _extract_blob_info(blob: Blob) -> tuple[Blob, str, str, int] | None: + try: + name = blob.name + full_name = blob.name.rsplit("v", 1)[0] + article = int(blob.name.rsplit("v", 1)[1].split(".")[0]) + except ValueError: + return None + return blob, name, full_name, article + url_dict = {} for yearmonth in yearmonth_list: - iterator = client.list_blobs(bucket, prefix=f"arxiv/arxiv/pdf/{yearmonth}") + all_blobs = client.list_blobs(bucket, prefix=f"arxiv/arxiv/pdf/{yearmonth}") # If more than one version is found, we only keep the last one + blobs_info = (_extract_blob_info(blob) for blob in all_blobs) df = pd.DataFrame( - ( - ( - el, - el.name, - el.name.rsplit("v", 1)[0], - int(el.name.rsplit("v", 1)[1].split(".")[0]), - ) - for el in iterator - ), + (info for info in blobs_info if info is not None), columns=["blob", "fullname", "article", "version"], ) diff --git a/tests/unit/database/test_download.py b/tests/unit/database/test_download.py index 34928e915..3981dc758 100644 --- a/tests/unit/database/test_download.py +++ b/tests/unit/database/test_download.py @@ -183,32 +183,42 @@ def test_get_gcs_urls(): fake_client = Mock() fake_bucket = Bucket(fake_client, "my_dir/file.txt") fake_blobs_by_prefix = { + "arxiv/arxiv/pdf/2109": [ + Blob("topic-a/99.6767v1.1.pdf", fake_bucket), # invalid version + Blob("topic-v/99.6767v1.2.pdf", fake_bucket), # invalid version + Blob("topic-v/99.6767v1a.pdf", fake_bucket), # invalid version + Blob("topic-v/99.6767v10.pdf", fake_bucket), + Blob("topic-v/99.6767v3.pdf", fake_bucket), # older version + ], "arxiv/arxiv/pdf/2110": [ - Blob("topic-a/12.3450v1.pdf", fake_bucket), + Blob("topic-a/12.3450v1.pdf", fake_bucket), # older version Blob("topic-v/12.3450v2.pdf", 
fake_bucket), ], "arxiv/arxiv/pdf/2111": [ - Blob("topic-v/99.3450v2.pdf", fake_bucket), - Blob("topic-v/99.3450v3.pdf", fake_bucket), + Blob("topic-v/99.3450v2.pdf", fake_bucket), # older version + Blob("topic-v/99.3450v3.pdf", fake_bucket), # older version Blob("topic-v/99.3450v10.pdf", fake_bucket), ], "arxiv/arxiv/pdf/2112": [ Blob("topic-v/33.1v2.pdf", fake_bucket), Blob("topic-v/44.1v2.pdf", fake_bucket), Blob("topic-v/55.1v2.pdf", fake_bucket), - Blob("topic-v/55.1v1.pdf", fake_bucket), + Blob("topic-v/55.1v1.pdf", fake_bucket), # older version ], } fake_client.list_blobs.side_effect = lambda bucket, prefix: fake_blobs_by_prefix[ prefix ] - start_date = datetime(2021, 10, 1) + start_date = datetime(2021, 9, 1) end_date = datetime(2021, 12, 1) blobs_by_month = get_gcs_urls(fake_bucket, start_date, end_date) - assert fake_client.list_blobs.call_count == 3 - assert set(blobs_by_month) == {"2110", "2111", "2112"} + assert fake_client.list_blobs.call_count == 4 + assert set(blobs_by_month) == {"2109", "2110", "2111", "2112"} + assert set(blobs_by_month["2109"]) == set( + fake_blobs_by_prefix["arxiv/arxiv/pdf/2109"][-2:-1] + ) assert set(blobs_by_month["2110"]) == set( fake_blobs_by_prefix["arxiv/arxiv/pdf/2110"] ) From c9dcb395bac93fb4b78b258278c131bfb149ff27 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 23 Feb 2022 21:58:38 +0100 Subject: [PATCH 66/78] Add separate try except blocks for each source --- .../entrypoint/database/topic_extract.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index 3a116939c..dc8dafb76 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -173,7 +173,11 @@ def run( for path in inputs: logger.info(f"Processing {path}") topic_info = TopicInfo(source=article_source, path=path.resolve()) - journal_topics = 
get_topics_for_pmc_article(path) + try: + journal_topics = get_topics_for_pmc_article(path) + except Exception: + logger.error(f"Failed to extract topic from {path}") + if journal_topics: topic_info.add_journal_topics( "MeSH", mesh.resolve_parents(journal_topics, mesh_tree) @@ -198,8 +202,12 @@ def run( path=path.resolve(), element_in_file=i, ) - article_topics = extract_article_topics_for_pubmed_article(article) - journal_topics = extract_journal_topics_for_pubmed_article(article) + try: + article_topics = extract_article_topics_for_pubmed_article(article) + journal_topics = extract_journal_topics_for_pubmed_article(article) + except Exception: + logger.error(f"Failed to extract topic from {i}") + if article_topics: topic_info.add_article_topics( "MeSH", mesh.resolve_parents(article_topics, mesh_tree) @@ -212,7 +220,11 @@ def run( elif article_source is ArticleSource.ARXIV: for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): topic_info = TopicInfo(source=article_source, path=path) - topic_info.add_article_topics("arXiv", article_topics) + try: + topic_info.add_article_topics("arXiv", article_topics) + except Exception: + logger.error(f"Failed to extract topic from {path}") + all_results.append(topic_info.json()) elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: for path in inputs: @@ -220,7 +232,11 @@ def run( topic, journal = extract_article_topics_from_medrxiv_article(path) journal = journal.lower() topic_info = TopicInfo(source=ArticleSource(journal), path=path) - topic_info.add_article_topics("Subject Area", [topic]) + try: + topic_info.add_article_topics("Subject Area", [topic]) + except Exception: + logger.error(f"Failed to extract topic from {path}") + all_results.append(topic_info.json()) else: logger.error(f"The source type {source!r} is not implemented yet") From 587e239f651145b16c522f39904a7ae6dd2a4b6a Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 23 Feb 2022 22:12:51 +0100 Subject: [PATCH 67/78] Fix bug 
--- src/bluesearch/entrypoint/database/topic_extract.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index dc8dafb76..b045d364d 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -220,22 +220,19 @@ def run( elif article_source is ArticleSource.ARXIV: for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): topic_info = TopicInfo(source=article_source, path=path) - try: - topic_info.add_article_topics("arXiv", article_topics) - except Exception: - logger.error(f"Failed to extract topic from {path}") + topic_info.add_article_topics("arXiv", article_topics) all_results.append(topic_info.json()) elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: for path in inputs: logger.info(f"Processing {path}") - topic, journal = extract_article_topics_from_medrxiv_article(path) - journal = journal.lower() - topic_info = TopicInfo(source=ArticleSource(journal), path=path) try: - topic_info.add_article_topics("Subject Area", [topic]) + topic, journal = extract_article_topics_from_medrxiv_article(path) except Exception: logger.error(f"Failed to extract topic from {path}") + journal = journal.lower() + topic_info = TopicInfo(source=ArticleSource(journal), path=path) + topic_info.add_article_topics("Subject Area", [topic]) all_results.append(topic_info.json()) else: From 03e8af18d3238fa16ddab192ca60b39edfc336e8 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 23 Feb 2022 22:25:41 +0100 Subject: [PATCH 68/78] Format nicely --- src/bluesearch/entrypoint/database/run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 020b7b239..f60bdb098 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ 
-287,7 +287,6 @@ def program_args(self) -> list[str]: ["-R", "-m", r".*\.xml\.gz$"], ) - return command From 95847c5175d6e50e9d581d20bde6e140e19efd35 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 16:59:12 +0100 Subject: [PATCH 69/78] Add the possibility of early stopping --- src/bluesearch/entrypoint/database/run.py | 48 +++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index f60bdb098..75f070495 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -144,6 +144,13 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: `from-month_today` """, ) + parser.add_argument( + "--final-task", + type=str, + help="""Name of the task where to manually stop the pipeline. Note + that the task itself will be included. + """, + ) return parser @@ -533,6 +540,30 @@ def program_args(self) -> list[str]: return command +def get_all_dependencies(task: luigi.Task) -> set[luigi.Task]: + """Get all dependencies of a given task. + + Parameters + ---------- + task + Input task + + Returns + ------- + set[luigi.Task] + All the tasks that the `input` depends on. + """ + current_deps = set(task.deps()) + if not current_deps: + return set() + + else: + deps = set() + for current_dep in current_deps: + deps |= get_all_dependencies(current_dep) + + return deps | current_deps + def run( *, source: str, @@ -546,6 +577,7 @@ def run( grobid_host: str | None, grobid_port: int | None, identifier: str | None, + final_task: str | None, ) -> int: """Run overall pipeline. 
@@ -563,7 +595,7 @@ def run( ParseTask.capture_output = CAPTURE_OUTPUT AddTask.capture_output = CAPTURE_OUTPUT - final_task = AddTask( + add_task_inst = AddTask( source=source, from_month=from_month, filter_config=str(filter_config), @@ -575,14 +607,24 @@ def run( db_type=db_type, identifier=identifier, ) + if final_task is None: + selected_task_inst = add_task_inst + else: + all_dependencies = get_all_dependencies(add_task_inst) + all_dependencies_map = {t.__class__.__name__: t for t in all_dependencies} + + if final_task in all_dependencies_map: + selected_task_inst = all_dependencies_map[final_task] + else: + raise ValueError(f"Unrecognized final task {final_task}") luigi_kwargs = { - "tasks": [final_task], + "tasks": [selected_task_inst], "log_level": "WARNING", "local_scheduler": True, } if dry_run: - print(print_tree(final_task, last=False)) + print(print_tree(selected_task_inst, last=False)) else: luigi.build(**luigi_kwargs) From 5e255d429b6ebc8a2c45b3c6e73a2d3c6d1b44ff Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 18:38:51 +0100 Subject: [PATCH 70/78] Small modification --- src/bluesearch/entrypoint/database/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 75f070495..9a1ff4b9a 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -551,14 +551,14 @@ def get_all_dependencies(task: luigi.Task) -> set[luigi.Task]: Returns ------- set[luigi.Task] - All the tasks that the `input` depends on. + All the tasks that the `input` depends on including itself. 
""" current_deps = set(task.deps()) if not current_deps: return set() else: - deps = set() + deps = {task} for current_dep in current_deps: deps |= get_all_dependencies(current_dep) From 93569a30ebadc649da8214edb9c8b0fff7929893 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 19:05:40 +0100 Subject: [PATCH 71/78] Add iffy tests --- src/bluesearch/entrypoint/database/run.py | 21 ++--- tests/unit/entrypoint/database/test_run.py | 94 ++++++++++++++++++++++ 2 files changed, 106 insertions(+), 9 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 9a1ff4b9a..d01cf3438 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -158,7 +158,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: BBS_BINARY = ["bbs_database"] VERBOSITY = ["-v"] # for the entrypoint subprocesses CAPTURE_OUTPUT = False -OUTPUT_DIR_RAW = None # make sure the same datestamp for all tasks +IDENTIFIER = None # make sure the same for all tasks class DownloadTask(ExternalProgramTask): @@ -174,17 +174,21 @@ class DownloadTask(ExternalProgramTask): def output(self) -> luigi.LocalTarget: """Define download folder.""" - global OUTPUT_DIR_RAW - if OUTPUT_DIR_RAW is None: - today = datetime.today() - if self.identifier is None: + global IDENTIFIER + if self.identifier is not None: + identifier = self.identifier + + else: + if IDENTIFIER is None: + today = datetime.today() identifier = f"{self.from_month}_{today.strftime('%Y-%m-%d')}" + IDENTIFIER = identifier else: - identifier = self.identifier + identifier = IDENTIFIER - OUTPUT_DIR_RAW = Path(self.output_dir) / self.source / identifier / "raw" + output_dir = Path(self.output_dir) / self.source / identifier / "raw" - return luigi.LocalTarget(str(OUTPUT_DIR_RAW)) + return luigi.LocalTarget(str(output_dir)) def program_args(self) -> list[str]: """Define subprocess arguments.""" @@ -452,7 +456,6 @@ def 
output(self) -> luigi.LocalTarget: def program_args(self) -> list[str]: """Define subprocess arguments.""" output_dir = Path(self.output().path) - output_dir.mkdir(exist_ok=True) if (output_dir.parent / "converted_pdfs").exists(): input_dir = output_dir.parent / "converted_pdfs" diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index bb49fb289..b40705b92 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -18,6 +18,8 @@ import argparse import inspect import pathlib +from subprocess import Popen +from unittest.mock import Mock import pytest @@ -35,6 +37,7 @@ "grobid_host", "grobid_port", "identifier", + "final_task", } @@ -137,6 +140,7 @@ def test_pipelines(source, tasks, tmp_path, capsys): db_url="whatever", db_type="sqlite", identifier=None, + final_task=None, ) captured = capsys.readouterr() @@ -145,3 +149,93 @@ def test_pipelines(source, tasks, tmp_path, capsys): for stdout_line, task in zip(stdout_lines, tasks): assert task in stdout_line assert "PENDING" in stdout_line + + +@pytest.mark.parametrize( + "source", + [ + "arxiv", + "biorxiv", + "medrxiv", + "pmc", + "pubmed", + ] +) +def test_all( + tmp_path, + monkeypatch, + source, +): + identifier = "ABC" + root_dir = tmp_path / source / identifier + + fake_Popen_inst = Mock(spec=Popen) + fake_Popen_inst.returncode = 0 + + def create_output(args, **kwargs): + entrypoint = args[1] + + if entrypoint == "download": + output_path = root_dir / "raw/" + output_path.mkdir(parents=True) + + elif entrypoint == "topic-extract": + output_path = root_dir / "topic_infos.jsonl" + output_path.touch() + + elif entrypoint == "topic-filter": + output_path = root_dir / "filtering.csv" + output_path.touch() + + elif entrypoint == "convert-pdf": + output_path = root_dir / "converted_pdfs/" + output_path.mkdir() + + elif entrypoint == "parse": + output_path = root_dir / "parsed/" + output_path.mkdir() + + elif entrypoint == 
"add": + pass + + return fake_Popen_inst + + + fake_Popen_class = Mock(side_effect=create_output) + monkeypatch.setattr("subprocess.Popen", fake_Popen_class) + monkeypatch.setattr(run.UnzipTask, "run", lambda _: (root_dir / "raw_unzipped").mkdir()) + monkeypatch.setattr(run.PerformFilteringTask, "run", lambda _: (root_dir / "filtered/").mkdir()) + monkeypatch.setattr(run.AddTask, "complete", lambda _: False) + + run.run( + source=source, + from_month="1234-11", + filter_config=pathlib.Path("aa"), + output_dir=tmp_path, + dry_run=False, + mesh_topic_db=pathlib.Path("whatever"), + grobid_host="112431321", + grobid_port=8000, + db_url="whatever", + db_type="sqlite", + identifier=identifier, + final_task="AddTask", + ) + assert (root_dir / "raw").exists() + if source == "pmc": + assert (root_dir / "raw_unzipped").exists() + + assert (root_dir / "topic_infos.jsonl").exists() + assert (root_dir / "filtering.csv").exists() + assert (root_dir / "filtered").exists() + + if source == "arxiv": + assert (root_dir / "converted_pdfs").exists() + + assert (root_dir / "parsed").exists() + + if source == "arxiv": + assert fake_Popen_class.call_count == 6 + else: + assert fake_Popen_class.call_count == 5 + From 8620c8707d693e5874650e0faa1eb098f79f7d83 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 19:06:30 +0100 Subject: [PATCH 72/78] Run formatter --- src/bluesearch/entrypoint/database/run.py | 1 + tests/unit/entrypoint/database/test_run.py | 16 +++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index d01cf3438..22ceff5d2 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -567,6 +567,7 @@ def get_all_dependencies(task: luigi.Task) -> set[luigi.Task]: return deps | current_deps + def run( *, source: str, diff --git a/tests/unit/entrypoint/database/test_run.py 
b/tests/unit/entrypoint/database/test_run.py index b40705b92..2e74c2a09 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -152,14 +152,14 @@ def test_pipelines(source, tasks, tmp_path, capsys): @pytest.mark.parametrize( - "source", + "source", [ "arxiv", "biorxiv", "medrxiv", "pmc", "pubmed", - ] + ], ) def test_all( tmp_path, @@ -171,7 +171,7 @@ def test_all( fake_Popen_inst = Mock(spec=Popen) fake_Popen_inst.returncode = 0 - + def create_output(args, **kwargs): entrypoint = args[1] @@ -200,11 +200,14 @@ def create_output(args, **kwargs): return fake_Popen_inst - fake_Popen_class = Mock(side_effect=create_output) monkeypatch.setattr("subprocess.Popen", fake_Popen_class) - monkeypatch.setattr(run.UnzipTask, "run", lambda _: (root_dir / "raw_unzipped").mkdir()) - monkeypatch.setattr(run.PerformFilteringTask, "run", lambda _: (root_dir / "filtered/").mkdir()) + monkeypatch.setattr( + run.UnzipTask, "run", lambda _: (root_dir / "raw_unzipped").mkdir() + ) + monkeypatch.setattr( + run.PerformFilteringTask, "run", lambda _: (root_dir / "filtered/").mkdir() + ) monkeypatch.setattr(run.AddTask, "complete", lambda _: False) run.run( @@ -238,4 +241,3 @@ def create_output(args, **kwargs): assert fake_Popen_class.call_count == 6 else: assert fake_Popen_class.call_count == 5 - From efa6821b859c6418839f4123c1fb856e84381b01 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Thu, 24 Feb 2022 19:10:14 +0100 Subject: [PATCH 73/78] Ignore a luigi warning --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 791f75f4a..382503f7e 100644 --- a/tox.ini +++ b/tox.ini @@ -153,6 +153,7 @@ testpaths = tests filterwarnings = error ignore::DeprecationWarning:docker.*: + ignore::DeprecationWarning:luigi.task: addopts = --cov --cov-config=tox.ini From 7c815957f5244fe9cf175bc8297b2b382ac6eb7e Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Tue, 8 Mar 2022 11:07:15 +0100 Subject: [PATCH 74/78] Use context manager 
--- src/bluesearch/entrypoint/database/run.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 22ceff5d2..4660e0ba1 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -232,14 +232,13 @@ def run(self) -> None: all_tar_files = input_dir.rglob("*.tar.gz") for archive in all_tar_files: output_path = output_dir / archive.stem - my_tar = tarfile.open(archive) - all_articles = [x for x in my_tar.getmembers() if x.isfile()] - for article in all_articles: - output_path = output_dir / article.path.rpartition("/")[2] - f_in = my_tar.extractfile(article) - with open(output_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) # type: ignore - my_tar.close() + with tarfile.open(archive) as my_tar: + all_articles = [x for x in my_tar.getmembers() if x.isfile()] + for article in all_articles: + output_path = output_dir / article.path.rpartition("/")[2] + f_in = my_tar.extractfile(article) + with open(output_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) # type: ignore else: raise ValueError(f"Unsupported source {self.source}") From 95827a23e414cf11b9e03a06bc327e6acc231150 Mon Sep 17 00:00:00 2001 From: Jan Krepl Date: Wed, 9 Mar 2022 14:07:52 +0100 Subject: [PATCH 75/78] Move luigi parameters to a config file --- luigi.cfg | 2 ++ src/bluesearch/entrypoint/database/run.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/luigi.cfg b/luigi.cfg index b2c955f2b..8830e6998 100644 --- a/luigi.cfg +++ b/luigi.cfg @@ -1,2 +1,4 @@ [core] autoload_range=true + log_level = INFO + local_scheduler = True diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 4660e0ba1..824ea85ad 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -623,8 +623,6 @@ def run( luigi_kwargs = { "tasks": 
[selected_task_inst], - "log_level": "WARNING", - "local_scheduler": True, } if dry_run: print(print_tree(selected_task_inst, last=False)) From 0ff13e4850165d83fa18cef0b78f667fa8016807 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Tue, 15 Mar 2022 13:10:42 +0100 Subject: [PATCH 76/78] Remove requires/inherits decorator --- luigi.cfg | 23 ++ src/bluesearch/entrypoint/database/run.py | 265 +++++++-------------- tests/unit/entrypoint/database/test_run.py | 61 ++--- 3 files changed, 127 insertions(+), 222 deletions(-) diff --git a/luigi.cfg b/luigi.cfg index 8830e6998..5f9baf652 100644 --- a/luigi.cfg +++ b/luigi.cfg @@ -2,3 +2,26 @@ autoload_range=true log_level = INFO local_scheduler = True + +[GlobalParams] + source=pubmed + +[DownloadTask] + from_month=2021-12 + output_dir=luigi-pipeline + identifier= + ; emtpy string is considered default value + +[TopicExtractTask] + mesh_topic_db=luigi-pipeline/mesh_topic_db.json + +[TopicFilterTask] + filter_config=luigi-pipeline/filter-config.jsonl + +[ConvertPDFTask] + grobid_host=0.0.0.0 + grobid_port=8070 + +[AddTask] + db_url=luigi-pipeline/my-db.db + db_type=sqlite \ No newline at end of file diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 824ea85ad..5bfb3361a 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -20,6 +20,8 @@ import argparse import gzip import logging +import pathlib +import re import shutil import tarfile from datetime import datetime @@ -32,9 +34,6 @@ from defusedxml.ElementTree import tostring from luigi.contrib.external_program import ExternalProgramTask from luigi.tools.deps_tree import print_tree -from luigi.util import inherits, requires - -from bluesearch.database.article import ArticleSource logger = logging.getLogger(__name__) @@ -56,70 +55,29 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.description = "Run the overall pipeline." 
parser.add_argument( - "--source", - required=True, - type=str, - choices=[member.value for member in ArticleSource], - help="Source of the articles.", - ) - parser.add_argument( - "--from-month", - required=True, + "--final-task", type=str, - help="The starting month (included) for the download in format YYYY-MM. " - "All papers from the given month until today will be downloaded.", - ) - parser.add_argument( - "--filter-config", - required=True, - type=Path, - help=""" - Path to a .JSONL file that defines all the rules for filtering. - """, + choices=( + "DownloadTask", + "UnzipTask", + "TopicExtractTask", + "TopicFilterTask", + "PerformFilteringTask", + "ConvertPDFTask", + "ParseTask", + "AddTask", + ), + help="Final task of the luigi pipeline.", ) parser.add_argument( - "--output-dir", - required=True, + "--config-path", type=Path, - help=""" - Path to the output folder. All the results stored under - `output_dir/source/date` where date is concatenation of the - `from_month` and the day of execution of this command. - """, - ) - parser.add_argument( - "--db-url", - required=True, - type=str, - help=""" - The location of the database depending on the database type. - - For MySQL and MariaDB the server URL should be provided, for SQLite the - location of the database file. Generally, the scheme part of - the URL should be omitted, e.g. for MySQL the URL should be - of the form 'my_sql_server.ch:1234/my_database' and for SQLite - of the form '/path/to/the/local/database.db'. - """, + help="Configuration Path.", ) parser.add_argument( - "--db-type", - default="sqlite", + "--luigi-config", type=str, - choices=("mariadb", "mysql", "postgres", "sqlite"), - help="Type of the database.", - ) - parser.add_argument( - "--mesh-topic-db", - type=Path, - help=""" - The JSON file with MeSH topic hierarchy information. Mandatory for - source types "pmc" and "pubmed". 
- - The JSON file should contain a flat dictionary with MeSH topic tree - numbers mapped to the corresponding topic labels. This file can be - produced using the `bbs_database parse-mesh-rdf` command. See that - command's description for more details. - """, + help="Configuration parameters.", ) parser.add_argument( "--dry-run", @@ -127,30 +85,6 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: action="store_true", help="Prints out a diagram of the pipeline without running it.", ) - parser.add_argument( - "--grobid-host", - type=str, - help="The host of the GROBID server.", - ) - parser.add_argument( - "--grobid-port", - type=int, - help="The port of the GROBID server.", - ) - parser.add_argument( - "--identifier", - type=str, - help="""Custom name of the identifier. If not specified, we use - `from-month_today` - """, - ) - parser.add_argument( - "--final-task", - type=str, - help="""Name of the task where to manually stop the pipeline. Note - that the task itself will be included. - """, - ) return parser @@ -161,13 +95,18 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: IDENTIFIER = None # make sure the same for all tasks +class GlobalParams(luigi.Config): + """Global configuration.""" + + source = luigi.Parameter() + + class DownloadTask(ExternalProgramTask): """Download raw files. They will be stored in the `raw/` folder. 
""" - source = luigi.Parameter() from_month = luigi.Parameter() output_dir = luigi.Parameter() identifier = luigi.OptionalParameter() @@ -186,7 +125,7 @@ def output(self) -> luigi.LocalTarget: else: identifier = IDENTIFIER - output_dir = Path(self.output_dir) / self.source / identifier / "raw" + output_dir = Path(self.output_dir) / GlobalParams().source / identifier / "raw" return luigi.LocalTarget(str(output_dir)) @@ -197,13 +136,12 @@ def program_args(self) -> list[str]: *BBS_BINARY, "download", *VERBOSITY, - self.source, + GlobalParams().source, self.from_month, output_dir, ] -@requires(DownloadTask) class UnzipTask(ExternalProgramTask): """Unzip raw files (if necessary). @@ -211,7 +149,10 @@ class UnzipTask(ExternalProgramTask): are stored inside of `raw_unzipped`. """ - source = luigi.Parameter() + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return DownloadTask() def output(self) -> luigi.LocalTarget: """Define unzipping folder.""" @@ -226,7 +167,7 @@ def run(self) -> None: output_dir = Path(self.output().path) # raw_unzipped output_dir.mkdir(exist_ok=True, parents=True) - if self.source == "pmc": + if GlobalParams().source == "pmc": # .tar.gz # We want collapse the folder hierarchy all_tar_files = input_dir.rglob("*.tar.gz") @@ -241,10 +182,9 @@ def run(self) -> None: shutil.copyfileobj(f_in, f_out) # type: ignore else: - raise ValueError(f"Unsupported source {self.source}") + raise ValueError(f"Unsupported source {GlobalParams().source}") -@inherits(DownloadTask, UnzipTask) class TopicExtractTask(ExternalProgramTask): """Topic extraction. @@ -253,15 +193,15 @@ class TopicExtractTask(ExternalProgramTask): `topic_infos.jsonl`. 
""" - source = luigi.Parameter() mesh_topic_db = luigi.Parameter() - def requires(self) -> luigi.Task: + @staticmethod + def requires() -> luigi.Task: """Define conditional dependencies.""" - if self.source in {"pmc"}: - return self.clone(UnzipTask) + if GlobalParams().source in {"pmc"}: + return UnzipTask() else: - return self.clone(DownloadTask) + return DownloadTask() def output(self) -> luigi.LocalTarget: """Define output file path.""" @@ -279,20 +219,20 @@ def program_args(self) -> list[str]: *BBS_BINARY, "topic-extract", *VERBOSITY, - self.source, + GlobalParams().source, input_dir, output_dir, ] - if self.source in {"medrxiv", "biorxiv"}: + if GlobalParams().source in {"medrxiv", "biorxiv"}: command.extend( ["-R", "-m", r".*\.meca$"], ) - if self.source in {"pmc", "pubmed"}: + if GlobalParams().source in {"pmc", "pubmed"}: command.append(f"--mesh-topic-db={self.mesh_topic_db}") - if self.source == "pubmed": + if GlobalParams().source == "pubmed": command.extend( ["-R", "-m", r".*\.xml\.gz$"], ) @@ -300,7 +240,6 @@ def program_args(self) -> list[str]: return command -@requires(TopicExtractTask) class TopicFilterTask(ExternalProgramTask): """Run topic filtering entrypoint. @@ -310,6 +249,11 @@ class TopicFilterTask(ExternalProgramTask): filter_config = luigi.Parameter() + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return TopicExtractTask() + def output(self) -> luigi.LocalTarget: """Define output file.""" output_file = Path(self.input().path).parent / "filtering.csv" @@ -333,7 +277,6 @@ def program_args(self) -> list[str]: return command -@requires(TopicFilterTask) class PerformFilteringTask(luigi.Task): """Create folder that only contains relevant articles. @@ -341,6 +284,11 @@ class PerformFilteringTask(luigi.Task): stage. The only input is the `filtering.csv`. 
""" + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return TopicFilterTask() + def output(self) -> luigi.LocalTarget: """Define output folder.""" output_dir = Path(self.input().path).parent / "filtered" @@ -355,7 +303,7 @@ def run(self) -> None: output_dir.mkdir(exist_ok=True) - if self.source == "pubmed": + if GlobalParams().source == "pubmed": # Find all input files (.xml.gz) all_input_files = [Path(p) for p in filtering["path"].unique()] @@ -396,7 +344,6 @@ def create_symlink(path): accepted.apply(create_symlink) -@requires(PerformFilteringTask) class ConvertPDFTask(ExternalProgramTask): """Convert PDFs to XMLs. @@ -407,6 +354,11 @@ class ConvertPDFTask(ExternalProgramTask): grobid_host = luigi.Parameter() grobid_port = luigi.IntParameter() + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return PerformFilteringTask() + def program_args(self) -> list[str]: """Define subprocess arguments.""" input_dir = Path(self.input().path).parent / "filtered" @@ -431,7 +383,6 @@ def output(self) -> luigi.LocalTarget: return luigi.LocalTarget(str(output_file)) -@inherits(ConvertPDFTask, PerformFilteringTask) class ParseTask(ExternalProgramTask): """Parse articles. @@ -439,12 +390,13 @@ class ParseTask(ExternalProgramTask): `source="arxiv"` `converted_pdfs/`). 
""" - def requires(self) -> luigi.Task: + @staticmethod + def requires() -> luigi.Task: """Define conditional dependencies.""" - if self.source == "arxiv": - return self.clone(ConvertPDFTask) + if GlobalParams().source == "arxiv": + return ConvertPDFTask() else: - return self.clone(PerformFilteringTask) + return PerformFilteringTask() def output(self) -> luigi.LocalTarget: """Define output folder.""" @@ -469,7 +421,7 @@ def program_args(self) -> list[str]: "pmc": "jats-xml", "pubmed": "pubmed-xml-set", } - parser = source2parser[self.source] + parser = source2parser[GlobalParams().source] command = [ *BBS_BINARY, @@ -483,7 +435,6 @@ def program_args(self) -> list[str]: return command -@requires(ParseTask) class AddTask(ExternalProgramTask): """Add parsed articles to the database. @@ -494,6 +445,11 @@ class AddTask(ExternalProgramTask): db_url = luigi.Parameter() db_type = luigi.Parameter() + @staticmethod + def requires() -> luigi.Task: + """Define dependency.""" + return ParseTask() + def complete(self) -> bool: """Check if all articles inside of `parsed/` are in the database.""" # If all the articles are inside @@ -542,45 +498,12 @@ def program_args(self) -> list[str]: return command -def get_all_dependencies(task: luigi.Task) -> set[luigi.Task]: - """Get all dependencies of a given task. - - Parameters - ---------- - task - Input task - - Returns - ------- - set[luigi.Task] - All the tasks that the `input` depends on including itself. 
- """ - current_deps = set(task.deps()) - if not current_deps: - return set() - - else: - deps = {task} - for current_dep in current_deps: - deps |= get_all_dependencies(current_dep) - - return deps | current_deps - - def run( *, - source: str, - from_month: str, - filter_config: Path, - output_dir: Path, - db_url: str, - db_type: str, - mesh_topic_db: Path | None, dry_run: bool, - grobid_host: str | None, - grobid_port: int | None, - identifier: str | None, - final_task: str | None, + final_task: str | None = None, + config_path: Path | None = None, + luigi_config: str | None = None, ) -> int: """Run overall pipeline. @@ -598,36 +521,28 @@ def run( ParseTask.capture_output = CAPTURE_OUTPUT AddTask.capture_output = CAPTURE_OUTPUT - add_task_inst = AddTask( - source=source, - from_month=from_month, - filter_config=str(filter_config), - output_dir=str(output_dir), - mesh_topic_db=str(mesh_topic_db), - grobid_host=grobid_host, - grobid_port=grobid_port, - db_url=db_url, - db_type=db_type, - identifier=identifier, - ) - if final_task is None: - selected_task_inst = add_task_inst - else: - all_dependencies = get_all_dependencies(add_task_inst) - all_dependencies_map = {t.__class__.__name__: t for t in all_dependencies} + if config_path: + if not pathlib.Path(config_path).exists(): + raise ValueError(f"The configuration path {config_path} does not exist!") - if final_task in all_dependencies_map: - selected_task_inst = all_dependencies_map[final_task] - else: - raise ValueError(f"Unrecognized final task {final_task}") + config = luigi.configuration.get_config() + config.add_config_path(config_path) + config.reload() - luigi_kwargs = { - "tasks": [selected_task_inst], - } - if dry_run: - print(print_tree(selected_task_inst, last=False)) + if luigi_config: + config = luigi.configuration.get_config() + for param in luigi_config.split(","): + change = re.split(r"[.:]", param, maxsplit=3) + config.set(*change) + + if final_task: + final_task_call = globals()[final_task] else: 
+ final_task_call = AddTask - luigi.build(**luigi_kwargs) + if dry_run: + print(print_tree(final_task_call(), last=False)) + else: + luigi.build([final_task_call()]) return 0 diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 2e74c2a09..0d31de22e 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -17,7 +17,6 @@ import argparse import inspect -import pathlib from subprocess import Popen from unittest.mock import Mock @@ -26,39 +25,24 @@ from bluesearch.entrypoint.database import run RUN_PARAMS = { - "source", - "from_month", - "filter_config", - "output_dir", - "db_url", - "db_type", - "mesh_topic_db", - "dry_run", - "grobid_host", - "grobid_port", - "identifier", "final_task", + "config_path", + "luigi_config", + "dry_run", } def test_init_parser(): parser = run.init_parser(argparse.ArgumentParser()) - args = parser.parse_args( - [ - "--source=arxiv", - "--from-month=2021-12", - "--filter-config=/path/to/config.jsonl", - "--output-dir=some/output/dir", - "--db-url=some.url", - ] - ) + args = parser.parse_args([]) assert vars(args).keys() == RUN_PARAMS - # Test the values - assert args.source == "arxiv" - assert args.from_month == "2021-12" - assert args.filter_config == pathlib.Path("/path/to/config.jsonl") + # # Test the values + assert args.final_task is None + assert args.luigi_config is None + assert args.dry_run is False + assert args.config_path is None def test_run_arguments(): @@ -129,18 +113,9 @@ def test_run_arguments(): ) def test_pipelines(source, tasks, tmp_path, capsys): run.run( - source=source, - from_month="whatever", - filter_config=pathlib.Path("whatever"), - output_dir=tmp_path, + luigi_config=f"GlobalParams.source:{source}," + f"DownloadTask.output_dir:{tmp_path}", dry_run=True, - mesh_topic_db=pathlib.Path("whatever"), - grobid_host="whatever", - grobid_port=1234, - db_url="whatever", - db_type="sqlite", - identifier=None, - 
final_task=None, ) captured = capsys.readouterr() @@ -211,18 +186,10 @@ def create_output(args, **kwargs): monkeypatch.setattr(run.AddTask, "complete", lambda _: False) run.run( - source=source, - from_month="1234-11", - filter_config=pathlib.Path("aa"), - output_dir=tmp_path, + luigi_config=f"GlobalParams.source:{source}," + f"DownloadTask.output_dir:{tmp_path}," + f"DownloadTask.identifier:{identifier}", dry_run=False, - mesh_topic_db=pathlib.Path("whatever"), - grobid_host="112431321", - grobid_port=8000, - db_url="whatever", - db_type="sqlite", - identifier=identifier, - final_task="AddTask", ) assert (root_dir / "raw").exists() if source == "pmc": From bd77681a87f42973760646f19330e053e4b1b712 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Fri, 18 Mar 2022 08:51:18 +0100 Subject: [PATCH 77/78] Fix linting and add header luigi.cfg --- luigi.cfg | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/luigi.cfg b/luigi.cfg index 5f9baf652..ee1858a82 100644 --- a/luigi.cfg +++ b/luigi.cfg @@ -1,27 +1,44 @@ +;Blue Brain Search is a text mining toolbox focused on scientific use cases. +; +;Copyright (C) 2020 Blue Brain Project, EPFL. +; +;This program is free software: you can redistribute it and/or modify +;it under the terms of the GNU Lesser General Public License as published by +;the Free Software Foundation, either version 3 of the License, or +;(at your option) any later version. +; +;This program is distributed in the hope that it will be useful, +;but WITHOUT ANY WARRANTY; without even the implied warranty of +;MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;GNU Lesser General Public License for more details. +; +;You should have received a copy of the GNU Lesser General Public License +;along with this program. If not, see . 
+ [core] - autoload_range=true - log_level = INFO - local_scheduler = True +autoload_range = true +log_level = INFO +local_scheduler = True [GlobalParams] - source=pubmed +source = pubmed [DownloadTask] - from_month=2021-12 - output_dir=luigi-pipeline - identifier= - ; emtpy string is considered default value +from_month = 2021-12 +output_dir = luigi-pipeline +identifier = +; emtpy string is considered default value [TopicExtractTask] - mesh_topic_db=luigi-pipeline/mesh_topic_db.json +mesh_topic_db = luigi-pipeline/mesh_topic_db.json [TopicFilterTask] - filter_config=luigi-pipeline/filter-config.jsonl +filter_config = luigi-pipeline/filter-config.jsonl [ConvertPDFTask] - grobid_host=0.0.0.0 - grobid_port=8070 +grobid_host = 0.0.0.0 +grobid_port = 8070 [AddTask] - db_url=luigi-pipeline/my-db.db - db_type=sqlite \ No newline at end of file +db_url = luigi-pipeline/my-db.db +db_type = sqlite From 0d343d3af274de31c082bb00259dd1f9a70864f1 Mon Sep 17 00:00:00 2001 From: Emilie Delattre Date: Fri, 18 Mar 2022 09:44:06 +0100 Subject: [PATCH 78/78] Add more info about run arguments --- src/bluesearch/entrypoint/database/run.py | 32 ++++++++++++++-------- tests/unit/entrypoint/database/test_run.py | 12 ++++---- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/bluesearch/entrypoint/database/run.py b/src/bluesearch/entrypoint/database/run.py index 5bfb3361a..d2414331c 100644 --- a/src/bluesearch/entrypoint/database/run.py +++ b/src/bluesearch/entrypoint/database/run.py @@ -70,14 +70,20 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: help="Final task of the luigi pipeline.", ) parser.add_argument( - "--config-path", + "--luigi-config-path", type=Path, - help="Configuration Path.", + help="Path to Luigi configuration file. By default, " + "luigi is looking into: /etc/luigi/luigi.cfg, luigi.cfg" + "and the environment variable LUIGI_CONFIG_PATH." 
+ "If a path is specified, it is the one used.", ) parser.add_argument( - "--luigi-config", + "--luigi-config-args", type=str, - help="Configuration parameters.", + help="Comma separated key-value arguments for Luigi configuration, " + "e.g. '--luigi-config GlobalParams.source:arxiv," + "DownloadTask.from-month:2021-04'. Overwrites the content of Luigi " + "configuration file (see --luigi-config-path).", ) parser.add_argument( "--dry-run", @@ -502,8 +508,8 @@ def run( *, dry_run: bool, final_task: str | None = None, - config_path: Path | None = None, - luigi_config: str | None = None, + luigi_config_path: Path | None = None, + luigi_config_args: str | None = None, ) -> int: """Run overall pipeline. @@ -521,17 +527,19 @@ def run( ParseTask.capture_output = CAPTURE_OUTPUT AddTask.capture_output = CAPTURE_OUTPUT - if config_path: - if not pathlib.Path(config_path).exists(): - raise ValueError(f"The configuration path {config_path} does not exist!") + if luigi_config_path: + if not pathlib.Path(luigi_config_path).exists(): + raise ValueError( + f"The configuration path {luigi_config_path} " f"does not exist!" 
+ ) config = luigi.configuration.get_config() - config.add_config_path(config_path) + config.add_config_path(luigi_config_path) config.reload() - if luigi_config: + if luigi_config_args: config = luigi.configuration.get_config() - for param in luigi_config.split(","): + for param in luigi_config_args.split(","): change = re.split(r"[.:]", param, maxsplit=3) config.set(*change) diff --git a/tests/unit/entrypoint/database/test_run.py b/tests/unit/entrypoint/database/test_run.py index 0d31de22e..edfee0786 100644 --- a/tests/unit/entrypoint/database/test_run.py +++ b/tests/unit/entrypoint/database/test_run.py @@ -26,8 +26,8 @@ RUN_PARAMS = { "final_task", - "config_path", - "luigi_config", + "luigi_config_path", + "luigi_config_args", "dry_run", } @@ -40,9 +40,9 @@ def test_init_parser(): # # Test the values assert args.final_task is None - assert args.luigi_config is None + assert args.luigi_config_args is None assert args.dry_run is False - assert args.config_path is None + assert args.luigi_config_path is None def test_run_arguments(): @@ -113,7 +113,7 @@ def test_run_arguments(): ) def test_pipelines(source, tasks, tmp_path, capsys): run.run( - luigi_config=f"GlobalParams.source:{source}," + luigi_config_args=f"GlobalParams.source:{source}," f"DownloadTask.output_dir:{tmp_path}", dry_run=True, ) @@ -186,7 +186,7 @@ def create_output(args, **kwargs): monkeypatch.setattr(run.AddTask, "complete", lambda _: False) run.run( - luigi_config=f"GlobalParams.source:{source}," + luigi_config_args=f"GlobalParams.source:{source}," f"DownloadTask.output_dir:{tmp_path}," f"DownloadTask.identifier:{identifier}", dry_run=False,