This repository has been archived by the owner on Jan 29, 2024. It is now read-only.

Add the TopicInfo data structure (#542)
* Add the ArticleSource enum
* Add the TopicInfo class

Co-authored-by: Francesco Casalegno <[email protected]>
Stannislav and FrancescoCasalegno authored Jan 19, 2022
1 parent ff190b8 commit 5ed9701
Showing 11 changed files with 289 additions and 122 deletions.
1 change: 1 addition & 0 deletions docs/source/api/bluesearch.database.rst
@@ -14,6 +14,7 @@ Submodules
   bluesearch.database.mining_cache
   bluesearch.database.pdf
   bluesearch.database.topic
   bluesearch.database.topic_info

Module contents
---------------
7 changes: 7 additions & 0 deletions docs/source/api/bluesearch.database.topic_info.rst
@@ -0,0 +1,7 @@
bluesearch.database.topic\_info module
======================================

.. automodule:: bluesearch.database.topic_info
   :members:
   :undoc-members:
   :show-inheritance:
2 changes: 2 additions & 0 deletions docs/source/whatsnew.rst
@@ -29,6 +29,8 @@ Legend

Latest
======
- |Add| the :code:`bluesearch.database.topic_info.TopicInfo` class
- |Add| the :code:`bluesearch.database.article.ArticleSource` enum class
- |Add| extraction of journal and article topics for :code:`arxiv` papers
  through CLI command :code:`bbs_database topic-extract arxiv`.
- |Add| extraction of journal and article topics for :code:`pubmed` papers
12 changes: 12 additions & 0 deletions src/bluesearch/database/article.py
@@ -17,6 +17,7 @@
"""Abstraction of scientific article data and related tools."""
from __future__ import annotations

+import enum
import html
import re
import string
@@ -33,6 +34,17 @@
from bluesearch.database.identifiers import generate_uid


class ArticleSource(enum.Enum):
    """The source of an article."""

    ARXIV = "arxiv"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"
    PMC = "pmc"
    PUBMED = "pubmed"
    UNKNOWN = "unknown"


def get_arxiv_id(path: str | Path, with_prefix: bool = True) -> str:
"""Compute arXiv ID, including version, from file path.
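With the enum in place, a source string (for example a CLI argument) can be round-tripped to a member and back. A minimal sketch, using only the members defined above:

from bluesearch.database.article import ArticleSource

source = ArticleSource("pmc")  # look up a member by its string value
assert source is ArticleSource.PMC
assert source.value == "pmc"

# Strings outside the enum fail fast, which doubles as input validation.
try:
    ArticleSource("elsevier")
except ValueError as err:
    print(err)  # 'elsevier' is not a valid ArticleSource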
2 changes: 1 addition & 1 deletion src/bluesearch/database/topic.py
@@ -402,7 +402,7 @@ def extract_article_topics_from_medrxiv_article(
    Returns
    -------
-    topic : pathlib.Path or str
+    topic : str
        The subject area of the article.
    journal : str
        The journal the article was published in. Should be either
121 changes: 121 additions & 0 deletions src/bluesearch/database/topic_info.py
@@ -0,0 +1,121 @@
# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Implementation of the TopicInfo data structure."""
from __future__ import annotations

import copy
import datetime
import pathlib
from dataclasses import dataclass, field
from typing import Any

import bluesearch
from bluesearch.database.article import ArticleSource


@dataclass
class TopicInfo:
    """The topic information extracted from a journal article.

    For the spec see the following GitHub issue/comment:
    https://github.com/BlueBrain/Search/issues/518#issuecomment-985525160
    """

    source: ArticleSource
    path: str | pathlib.Path
    element_in_file: int | None = None
    article_topics: dict[str, list[str]] = field(init=False, default_factory=dict)
    journal_topics: dict[str, list[str]] = field(init=False, default_factory=dict)

    def __post_init__(self) -> None:
        """Run the post-initialization."""
        self.creation_date = datetime.datetime.now()
        self.path = pathlib.Path(self.path).resolve()

    @staticmethod
    def _add_topics(
        mapping: dict[str, list[str]], kind: str, topics: list[str]
    ) -> None:
        """Add topics to a mapping of topic collections.

        Parameters
        ----------
        mapping
            A mapping of the form kind -> list-of-topics that shall be
            updated in-place. For example ``{"MeSH": ["topic 1", "topic 2"]}``.
        kind
            The topic kind. Corresponds to a key in ``mapping``.
        topics
            The topics to add. Corresponds to a value in ``mapping``.
        """
        updated_topics = mapping.get(kind, []) + topics
        mapping[kind] = sorted(set(updated_topics))

    def add_article_topics(self, kind: str, topics: list[str]) -> None:
        """Add article topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self._add_topics(self.article_topics, kind, topics)

    def add_journal_topics(self, kind: str, topics: list[str]) -> None:
        """Add journal topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self._add_topics(self.journal_topics, kind, topics)

    def json(self) -> dict:
        """Convert the contents of this class to a structured dictionary.

        Apart from the source, path, and topic entries, a "metadata"
        top-level key will be added containing a dictionary with the
        entries "created-date" and "bbs-version".

        Returns
        -------
        dict
            The structured dictionary with all topic information.
        """
        metadata: dict[str, Any] = {
            "created-date": self.creation_date.strftime("%Y-%m-%d %H:%M:%S"),
            "bbs-version": bluesearch.__version__,
        }
        if self.element_in_file is not None:
            metadata["element_in_file"] = self.element_in_file

        json = {
            "source": self.source.value,
            "path": str(self.path),
            "topics": {
                "article": copy.deepcopy(self.article_topics),
                "journal": copy.deepcopy(self.journal_topics),
            },
            "metadata": metadata,
        }

        return json
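A short usage sketch of the new class; the file path below is hypothetical, and any path works since it is only resolved, never opened:

from bluesearch.database.article import ArticleSource
from bluesearch.database.topic_info import TopicInfo

info = TopicInfo(source=ArticleSource.PMC, path="papers/article.xml")

# Repeated calls merge, deduplicate, and sort the per-kind topic lists.
info.add_article_topics("MeSH", ["Neurons", "Brain"])
info.add_article_topics("MeSH", ["Brain"])
info.add_journal_topics("MAG", ["Neuroscience"])

print(info.json())
# {'source': 'pmc',
#  'path': '/.../papers/article.xml',   # resolved to an absolute path
#  'topics': {'article': {'MeSH': ['Brain', 'Neurons']},
#             'journal': {'MAG': ['Neuroscience']}},
#  'metadata': {'created-date': '...', 'bbs-version': '...'}}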
27 changes: 15 additions & 12 deletions src/bluesearch/entrypoint/database/download.py
@@ -23,24 +23,26 @@
from itertools import chain
from pathlib import Path

+from bluesearch.database.article import ArticleSource

logger = logging.getLogger(__name__)

# Data conventions and formats are different prior to these dates. We
# download only if the starting date is equal to or more recent than the
# respective threshold.
MIN_DATE = {
    # https://arxiv.org/help/arxiv_identifier#old
-    "arxiv": datetime(2007, 4, 1),
+    ArticleSource.ARXIV: datetime(2007, 4, 1),
    # https://www.biorxiv.org/tdm + looked into Current Content folder on GPFS
-    "biorxiv": datetime(2018, 12, 1),
+    ArticleSource.BIORXIV: datetime(2018, 12, 1),
    # https://www.medrxiv.org/tdm + looked into Current Content folder on GPFS
-    "medrxiv": datetime(2020, 10, 1),
+    ArticleSource.MEDRXIV: datetime(2020, 10, 1),
    # This should change every year in December:
    # see https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/
-    "pmc": datetime(2021, 12, 1),
+    ArticleSource.PMC: datetime(2021, 12, 1),
    # This should change every year in December:
    # see https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
-    "pubmed": datetime(2021, 12, 1),
+    ArticleSource.PUBMED: datetime(2021, 12, 1),
}


@@ -88,7 +90,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    parser.add_argument(
        "source",
        type=str,
-        choices=("arxiv", "biorxiv", "medrxiv", "pmc", "pubmed"),
+        choices=[member.value for member in ArticleSource],
        help="Source of the download.",
    )
    parser.add_argument(
@@ -129,16 +131,17 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> int:
        get_s3_urls,
    )

-    if from_month < MIN_DATE[source]:
+    article_source = ArticleSource(source)
+    if from_month < MIN_DATE[article_source]:
        logger.error(
-            f"The papers from before {MIN_DATE[source].strftime('%B %Y')} "
+            f"The papers from before {MIN_DATE[article_source].strftime('%B %Y')} "
            "follow a different format and can't be downloaded. "
            "Please contact the developers if you need them. "
            "To proceed please re-run the command with a different starting month."
        )
        return 1

if source == "pmc":
if article_source == ArticleSource.PMC:
url_dict = {}
for component in {"author_manuscript", "oa_comm", "oa_noncomm"}:
url_dict[component] = generate_pmc_urls(component, from_month)
@@ -158,7 +161,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> int:
            component_dir.mkdir(exist_ok=True, parents=True)
            download_articles(url_list, component_dir)
        return 0
elif source == "pubmed":
elif article_source == ArticleSource.PUBMED:
url_list = get_pubmed_urls(from_month)
if dry_run:
print("URL requests from:")
@@ -169,7 +172,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> int:
        output_dir.mkdir(exist_ok=True, parents=True)
        download_articles(url_list, output_dir)
        return 0
-    elif source in {"biorxiv", "medrxiv"}:
+    elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}:

        key_id = getpass.getpass("aws_access_key_id: ")
        secret_access_key = getpass.getpass("aws_secret_access_key: ")
@@ -192,7 +195,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> int:
        logger.info(f"Start downloading {source} papers.")
        download_s3_articles(bucket, url_dict, output_dir)
        return 0
elif source == "arxiv":
elif article_source == ArticleSource.ARXIV:
logger.info("Loading libraries")
from google.cloud.storage import Client

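A sketch of what the refactoring buys. The argparse choices are now derived from the enum rather than hardcoded (note that the comprehension, unlike the old tuple, also admits "unknown"), and the MIN_DATE thresholds are keyed by validated enum members; the MIN_DATE import path is assumed from the file location shown above:

from datetime import datetime

from bluesearch.database.article import ArticleSource
from bluesearch.entrypoint.database.download import MIN_DATE

# Derived CLI choices; one entry more than the old hardcoded tuple,
# because the enum also has an UNKNOWN member.
print([member.value for member in ArticleSource])
# ['arxiv', 'biorxiv', 'medrxiv', 'pmc', 'pubmed', 'unknown']

# The raw CLI string is converted once, then used for every lookup.
source = ArticleSource("arxiv")
from_month = datetime(2010, 1, 1)
print(from_month >= MIN_DATE[source])  # True: at or after April 2007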
