diff --git a/docs/source/api/bluesearch.database.rst b/docs/source/api/bluesearch.database.rst index 45a7e8f2e..73eb468dc 100644 --- a/docs/source/api/bluesearch.database.rst +++ b/docs/source/api/bluesearch.database.rst @@ -14,6 +14,7 @@ Submodules bluesearch.database.mining_cache bluesearch.database.pdf bluesearch.database.topic + bluesearch.database.topic_info Module contents --------------- diff --git a/docs/source/api/bluesearch.database.topic_info.rst b/docs/source/api/bluesearch.database.topic_info.rst new file mode 100644 index 000000000..dff7075e0 --- /dev/null +++ b/docs/source/api/bluesearch.database.topic_info.rst @@ -0,0 +1,7 @@ +bluesearch.database.topic\_info module +====================================== + +.. automodule:: bluesearch.database.topic_info + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/whatsnew.rst b/docs/source/whatsnew.rst index 07c2b580f..6981547a5 100644 --- a/docs/source/whatsnew.rst +++ b/docs/source/whatsnew.rst @@ -29,6 +29,8 @@ Legend Latest ====== +- |Add| the :code:`bluesearch.database.topic_info.TopicInfo` class +- |Add| the :code:`bluesearch.database.article.ArticleSource` enum class - |Add| extraction of journal and article topics for :code:`arxiv` papers through CLI command :code:`bbs_database topic-extract arxiv`. 
class ArticleSource(enum.Enum):
    """Enumeration of the places an article can originate from.

    Each member's value is the lowercase string identifier of the source,
    so a member can be looked up from such a string with
    ``ArticleSource("pmc")`` and converted back with ``.value``.
    """

    # Preprint servers.
    ARXIV = "arxiv"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"

    # NCBI collections.
    PMC = "pmc"
    PUBMED = "pubmed"

    # Placeholder for articles whose source is not one of the above.
    # NOTE(review): code that builds CLI ``choices`` by iterating over all
    # members will also offer "unknown" - confirm that this is intended.
    UNKNOWN = "unknown"
@dataclass
class TopicInfo:
    """The topic information extracted from a journal article.

    For the spec see the following GitHub issue/comment:
    https://github.com/BlueBrain/Search/issues/518#issuecomment-985525160

    Parameters
    ----------
    source
        The source of the article.
    path
        The path of the file the article was read from. It is converted
        to a resolved ``pathlib.Path`` during post-initialization.
    element_in_file
        The index of the article inside the file, for files that contain
        more than one article. It is only written to the JSON metadata
        if it is not ``None``.

    Attributes
    ----------
    article_topics
        A mapping topic kind -> sorted list of unique article topics.
    journal_topics
        A mapping topic kind -> sorted list of unique journal topics.
    creation_date
        The (naive, local) time at which this instance was created.
    """

    source: ArticleSource
    path: str | pathlib.Path
    element_in_file: int | None = None
    article_topics: dict[str, list[str]] = field(init=False, default_factory=dict)
    journal_topics: dict[str, list[str]] = field(init=False, default_factory=dict)
    # Declared as a real field so that it is visible to type checkers and to
    # ``dataclasses.fields``; it is excluded from ``__init__``, ``repr`` and
    # comparisons so that the behaviour of those stays exactly as before.
    creation_date: datetime.datetime = field(init=False, repr=False, compare=False)

    def __post_init__(self) -> None:
        """Run the post-initialization.

        Record the creation time and normalize ``path`` to a resolved
        ``pathlib.Path``.
        """
        self.creation_date = datetime.datetime.now()
        self.path = pathlib.Path(self.path).resolve()

    @staticmethod
    def _add_topics(
        mapping: dict[str, list[str]], kind: str, topics: list[str]
    ) -> None:
        """Add topics to a mapping with collection of topics.

        The topics stored under ``kind`` are merged with the new ones,
        de-duplicated and sorted.

        Parameters
        ----------
        mapping
            A mapping of the form kind -> list-of-topics that shall be
            updated in-place. For example ``{"MeSH": ["topic 1", "topic 2"]}``.
        kind
            The topic kind. Corresponds to a key in ``mapping``.
        topics
            The topics to add. Corresponds to a value in ``mapping``.
        """
        # List concatenation is kept on purpose: it rejects a bare string,
        # which would otherwise be silently split into single characters.
        merged = mapping.get(kind, []) + topics
        mapping[kind] = sorted(set(merged))

    def add_article_topics(self, kind: str, topics: list[str]) -> None:
        """Add article topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self._add_topics(self.article_topics, kind, topics)

    def add_journal_topics(self, kind: str, topics: list[str]) -> None:
        """Add journal topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self._add_topics(self.journal_topics, kind, topics)

    def json(self) -> dict:
        """Convert the contents of this class to a structured dictionary.

        Apart from the source, path and topic entries a "metadata" top-level
        key will be added containing a dictionary with entries "created-date"
        and "bbs-version". If ``element_in_file`` is not ``None`` the
        metadata also contains an "element_in_file" entry.

        Returns
        -------
        dict
            The structured dictionary with all topic information.
        """
        metadata: dict[str, Any] = {
            "created-date": self.creation_date.strftime("%Y-%m-%d %H:%M:%S"),
            "bbs-version": bluesearch.__version__,
        }
        if self.element_in_file is not None:
            metadata["element_in_file"] = self.element_in_file

        # Deep copies keep later changes to the returned dictionary from
        # leaking back into this instance.
        return {
            "source": self.source.value,
            "path": str(self.path),
            "topics": {
                "article": copy.deepcopy(self.article_topics),
                "journal": copy.deepcopy(self.journal_topics),
            },
            "metadata": metadata,
        }
MIN_DATE = { # https://arxiv.org/help/arxiv_identifier#old - "arxiv": datetime(2007, 4, 1), + ArticleSource.ARXIV: datetime(2007, 4, 1), # https://www.biorxiv.org/tdm + looked into Current Content folder on GPFS - "biorxiv": datetime(2018, 12, 1), + ArticleSource.BIORXIV: datetime(2018, 12, 1), # https://www.medrxiv.org/tdm + looked into Current Content folder on GPFS - "medrxiv": datetime(2020, 10, 1), + ArticleSource.MEDRXIV: datetime(2020, 10, 1), # This should change every year in December: # see https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/ - "pmc": datetime(2021, 12, 1), + ArticleSource.PMC: datetime(2021, 12, 1), # This should change every year in December: # see https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/ - "pubmed": datetime(2021, 12, 1), + ArticleSource.PUBMED: datetime(2021, 12, 1), } @@ -88,7 +90,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.add_argument( "source", type=str, - choices=("arxiv", "biorxiv", "medrxiv", "pmc", "pubmed"), + choices=[member.value for member in ArticleSource], help="Source of the download.", ) parser.add_argument( @@ -129,16 +131,17 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i get_s3_urls, ) - if from_month < MIN_DATE[source]: + article_source = ArticleSource(source) + if from_month < MIN_DATE[article_source]: logger.error( - f"The papers from before {MIN_DATE[source].strftime('%B %Y')} " + f"The papers from before {MIN_DATE[article_source].strftime('%B %Y')} " "follow a different format and can't be downloaded. " "Please contact the developers if you need them. " "To proceed please re-run the command with a different starting month." 
) return 1 - if source == "pmc": + if article_source == ArticleSource.PMC: url_dict = {} for component in {"author_manuscript", "oa_comm", "oa_noncomm"}: url_dict[component] = generate_pmc_urls(component, from_month) @@ -158,7 +161,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i component_dir.mkdir(exist_ok=True, parents=True) download_articles(url_list, component_dir) return 0 - elif source == "pubmed": + elif article_source == ArticleSource.PUBMED: url_list = get_pubmed_urls(from_month) if dry_run: print("URL requests from:") @@ -169,7 +172,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i output_dir.mkdir(exist_ok=True, parents=True) download_articles(url_list, output_dir) return 0 - elif source in {"biorxiv", "medrxiv"}: + elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: key_id = getpass.getpass("aws_access_key_id: ") secret_access_key = getpass.getpass("aws_secret_access_key: ") @@ -192,7 +195,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i logger.info(f"Start downloading {source} papers.") download_s3_articles(bucket, url_dict, output_dir) return 0 - elif source == "arxiv": + elif article_source == ArticleSource.ARXIV: logger.info("Loading libraries") from google.cloud.storage import Client diff --git a/src/bluesearch/entrypoint/database/topic_extract.py b/src/bluesearch/entrypoint/database/topic_extract.py index e2d4bb2de..1368147ae 100644 --- a/src/bluesearch/entrypoint/database/topic_extract.py +++ b/src/bluesearch/entrypoint/database/topic_extract.py @@ -18,11 +18,12 @@ from __future__ import annotations import argparse -import datetime import logging from pathlib import Path from typing import Any +from bluesearch.database.article import ArticleSource + logger = logging.getLogger(__name__) @@ -44,14 +45,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser.add_argument( "source", - 
type=str, - choices=( - "arxiv", - "biorxiv", - "medrxiv", - "pmc", - "pubmed", - ), + choices=[member.value for member in ArticleSource], help=""" Format of the input. If extracting topic of several articles, all articles must have the same format. @@ -129,7 +123,6 @@ def run( """ from defusedxml import ElementTree - import bluesearch from bluesearch.database.topic import ( extract_article_topics_for_pubmed_article, extract_article_topics_from_medrxiv_article, @@ -137,6 +130,7 @@ def run( get_topics_for_arxiv_articles, get_topics_for_pmc_article, ) + from bluesearch.database.topic_info import TopicInfo from bluesearch.utils import JSONL, find_files try: @@ -153,102 +147,45 @@ def run( print(*inputs, sep="\n") return 0 + article_source = ArticleSource(source) all_results: list[dict[str, Any]] = [] - - if source == "pmc": + if article_source is ArticleSource.PMC: for path in inputs: logger.info(f"Processing {path}") + topic_info = TopicInfo(source=article_source, path=path.resolve()) journal_topics = get_topics_for_pmc_article(path) - all_results.append( - { - "source": "pmc", - "path": str(path.resolve()), - "topics": { - "journal": { - "MeSH": journal_topics, - }, - }, - "metadata": { - "created-date": datetime.datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - "bbs-version": bluesearch.version.__version__, - }, - } - ) - - elif source == "arxiv": - all_results = [ - { - "source": "arxiv", - "path": str(path.resolve()), - "topics": { - "article": { - "arXiv": article_topics, - }, - }, - "metadata": { - "created-date": datetime.datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - "bbs-version": bluesearch.version.__version__, - }, - } - for path, article_topics in get_topics_for_arxiv_articles(inputs).items() - ] - - elif source in {"biorxiv", "medrxiv"}: - for path in inputs: - logger.info(f"Processing {path}") - topic, journal = extract_article_topics_from_medrxiv_article(path) - all_results.append( - { - "source": journal, - "path": str(path.resolve()), - 
"topics": { - "article": { - "Subject Area": topic, - }, - }, - "metadata": { - "created-date": datetime.datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - "bbs-version": bluesearch.version.__version__, - }, - } - ) - - pass - - elif source == "pubmed": + if journal_topics: + topic_info.add_journal_topics("MeSH", journal_topics) + all_results.append(topic_info.json()) + elif article_source is ArticleSource.PUBMED: for path in inputs: logger.info(f"Processing {path}") articles = ElementTree.parse(input_path) for i, article in enumerate(articles.iter("PubmedArticle")): + topic_info = TopicInfo( + source=article_source, + path=path.resolve(), + element_in_file=i, + ) article_topics = extract_article_topics_for_pubmed_article(article) journal_topics = extract_journal_topics_for_pubmed_article(article) - all_results.append( - { - "source": "pubmed", - "path": str(path.resolve()), - "topics": { - "journal": { - "MeSH": journal_topics, - }, - "article": { - "MeSH": article_topics, - }, - }, - "metadata": { - "created-date": datetime.datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - "bbs-version": bluesearch.version.__version__, - "element_in_file": i, - }, - } - ) + if article_topics: + topic_info.add_article_topics("MeSH", article_topics) + if journal_topics: + topic_info.add_journal_topics("MeSH", journal_topics) + all_results.append(topic_info.json()) + elif article_source is ArticleSource.ARXIV: + for path, article_topics in get_topics_for_arxiv_articles(inputs).items(): + topic_info = TopicInfo(source=article_source, path=path) + topic_info.add_article_topics("arXiv", article_topics) + all_results.append(topic_info.json()) + elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}: + for path in inputs: + logger.info(f"Processing {path}") + topic, journal = extract_article_topics_from_medrxiv_article(path) + topic_info = TopicInfo(source=ArticleSource(journal), path=path) + topic_info.add_article_topics("Subject Area", [topic]) + 
class TestTopicInfo:
    """Unit tests for the ``TopicInfo`` data structure."""

    def test_instantiation(self):
        """The constructor arguments are available as attributes."""
        article_source = ArticleSource.ARXIV
        absolute_path = pathlib.Path("/some/path.test")

        info = TopicInfo(article_source, absolute_path)

        assert info.source == article_source
        assert info.path == absolute_path

    def test_relative_path_is_resolved(self):
        """A relative path is anchored at the current working directory."""
        article_source = ArticleSource.ARXIV
        relative_path = pathlib.Path("relative/path")

        info = TopicInfo(article_source, relative_path)

        assert info.source == article_source
        assert info.path == pathlib.Path.cwd() / relative_path

    @pytest.mark.parametrize(
        ("mapping", "kind", "topics", "result"),
        (
            # A new kind is created.
            ({}, "MeSH", ["topic 1"], {"MeSH": ["topic 1"]}),
            # New topics are merged into an existing kind and sorted.
            (
                {"MeSH": ["topic 2"]},
                "MeSH",
                ["topic 1"],
                {"MeSH": ["topic 1", "topic 2"]},
            ),
            # Duplicates are dropped.
            ({"MeSH": ["topic 1"]}, "MeSH", ["topic 1"], {"MeSH": ["topic 1"]}),
        ),
    )
    def test_add_topics(self, mapping, kind, topics, result):
        """The mapping is updated in-place with unique, sorted topics."""
        TopicInfo._add_topics(mapping, kind, topics)
        assert mapping == result

    def test_add_article_journal_topics(self):
        """Article and journal topics are stored independently."""
        info = TopicInfo(ArticleSource.UNKNOWN, "")

        info.add_article_topics("MeSH", ["AT 1", "AT 2", "AT 3"])
        info.add_journal_topics("MAP", ["JT 1", "JT 2"])

        assert info.article_topics == {"MeSH": ["AT 1", "AT 2", "AT 3"]}
        assert info.journal_topics == {"MAP": ["JT 1", "JT 2"]}

    def test_json(self):
        """The JSON output contains source, path, topics and metadata."""
        timestamp_format = "%Y-%m-%d %H:%M:%S"
        start = datetime.datetime.now().strftime(timestamp_format)

        info = TopicInfo(
            source=ArticleSource.PUBMED,
            path=pathlib.Path("/some/path.test"),
            element_in_file=5,
        )
        info.add_article_topics("MeSH", ["AT 1", "AT 2", "AT 3"])
        info.add_journal_topics("MAP", ["JT 1", "JT 2"])

        end = datetime.datetime.now().strftime(timestamp_format)

        output = info.json()
        # The metadata is time dependent, so it is checked separately.
        metadata = output.pop("metadata")
        expected = {
            "source": ArticleSource.PUBMED.value,
            "path": "/some/path.test",
            "topics": {
                "article": {"MeSH": ["AT 1", "AT 2", "AT 3"]},
                "journal": {"MAP": ["JT 1", "JT 2"]},
            },
        }
        assert output == expected
        assert start <= metadata["created-date"] <= end
        assert metadata["bbs-version"] == bluesearch.__version__

    def test_element_in_file(self):
        """The "element_in_file" metadata entry is only present if set."""
        without_index = TopicInfo(ArticleSource.UNKNOWN, "").json()
        assert without_index["metadata"].get("element_in_file") is None

        with_index = TopicInfo(ArticleSource.UNKNOWN, "", element_in_file=5).json()
        assert with_index["metadata"].get("element_in_file") == 5
expected_date, tmp_path, caplog): fake_datetime = limit_datetime - datetime.timedelta(days=32) with caplog.at_level(logging.ERROR): - exit_code = download.run(source, fake_datetime, tmp_path, dry_run=False) + exit_code = download.run(source.value, fake_datetime, tmp_path, dry_run=False) assert exit_code == 1 assert expected_date in caplog.text diff --git a/tests/unit/entrypoint/database/test_topic_extract.py b/tests/unit/entrypoint/database/test_topic_extract.py index ed9275a36..0a2d26acf 100644 --- a/tests/unit/entrypoint/database/test_topic_extract.py +++ b/tests/unit/entrypoint/database/test_topic_extract.py @@ -73,11 +73,11 @@ def test_input_path_not_correct(caplog): assert "Argument 'input_path'" in caplog.text -def test_wrong_source(test_data_path, caplog, tmp_path): +def test_source_type_not_implemented(test_data_path, caplog, tmp_path): pmc_path = test_data_path / "jats_article.xml" with caplog.at_level(logging.ERROR): exit_code = topic_extract.run( - source="wrong_type", + source="unknown", input_path=pmc_path, output_file=tmp_path, match_filename=None, @@ -86,7 +86,7 @@ def test_wrong_source(test_data_path, caplog, tmp_path): dry_run=False, ) assert exit_code == 1 - assert "The source type" in caplog.text + assert "not implemented" in caplog.text def test_dry_run(test_data_path, capsys, tmp_path): @@ -134,7 +134,6 @@ def test_pmc_source(test_data_path, capsys, monkeypatch, tmp_path): assert result["path"] == str(pmc_path) assert isinstance(result["topics"], dict) topics = result["topics"] - assert "article" not in topics assert "journal" in topics assert isinstance(topics["journal"], dict) assert topics["journal"]["MeSH"] == meshes @@ -177,7 +176,7 @@ def test_medbiorxiv_source(capsys, monkeypatch, tmp_path, source): # Mocking fake_extract_article_topics_from_medrxiv_article = Mock( - side_effect=lambda p: ("TOPIC", "JOURNAL") + side_effect=lambda p: ("TOPIC", source) ) monkeypatch.setattr( @@ -201,8 +200,8 @@ def test_medbiorxiv_source(capsys, 
monkeypatch, tmp_path, source): result = JSONL.load_jsonl(output_file) assert len(result) == 1 - assert result[0]["source"] == "JOURNAL" - assert result[0]["topics"]["article"]["Subject Area"] == "TOPIC" + assert result[0]["source"] == source + assert result[0]["topics"]["article"]["Subject Area"] == ["TOPIC"] def test_pubmed_source(test_data_path, capsys, monkeypatch, tmp_path):