This repository has been archived by the owner on Jan 29, 2024. It is now read-only.

Add the TopicInfo data structure (#542)
* Add the ArticleSource enum
* Add the TopicInfo class

Co-authored-by: Francesco Casalegno <[email protected]>
Stannislav and FrancescoCasalegno authored Jan 19, 2022
1 parent ff190b8 commit 5ed9701
Showing 11 changed files with 289 additions and 122 deletions.
1 change: 1 addition & 0 deletions docs/source/api/bluesearch.database.rst
@@ -14,6 +14,7 @@ Submodules
   bluesearch.database.mining_cache
   bluesearch.database.pdf
   bluesearch.database.topic
   bluesearch.database.topic_info

Module contents
---------------
7 changes: 7 additions & 0 deletions docs/source/api/bluesearch.database.topic_info.rst
@@ -0,0 +1,7 @@
bluesearch.database.topic\_info module
======================================

.. automodule:: bluesearch.database.topic_info
   :members:
   :undoc-members:
   :show-inheritance:
2 changes: 2 additions & 0 deletions docs/source/whatsnew.rst
@@ -29,6 +29,8 @@ Legend

Latest
======
- |Add| the :code:`bluesearch.database.topic_info.TopicInfo` class
- |Add| the :code:`bluesearch.database.article.ArticleSource` enum class
- |Add| extraction of journal and article topics for :code:`arxiv` papers
  through CLI command :code:`bbs_database topic-extract arxiv`.
- |Add| extraction of journal and article topics for :code:`pubmed` papers
12 changes: 12 additions & 0 deletions src/bluesearch/database/article.py
@@ -17,6 +17,7 @@
"""Abstraction of scientific article data and related tools."""
from __future__ import annotations

+import enum
import html
import re
import string
@@ -33,6 +34,17 @@
from bluesearch.database.identifiers import generate_uid


class ArticleSource(enum.Enum):
    """The source of an article."""

    ARXIV = "arxiv"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"
    PMC = "pmc"
    PUBMED = "pubmed"
    UNKNOWN = "unknown"


def get_arxiv_id(path: str | Path, with_prefix: bool = True) -> str:
"""Compute arXiv ID, including version, from file path.
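With the enum in place, a source string (for example a CLI argument) can be round-tripped to a member and back. A minimal sketch, using only the members defined above:

from bluesearch.database.article import ArticleSource

source = ArticleSource("pmc")  # look up a member by its string value
assert source is ArticleSource.PMC
assert source.value == "pmc"

# Strings outside the enum fail fast, which doubles as input validation.
try:
    ArticleSource("elsevier")
except ValueError as err:
    print(err)  # 'elsevier' is not a valid ArticleSource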
2 changes: 1 addition & 1 deletion src/bluesearch/database/topic.py
@@ -402,7 +402,7 @@ def extract_article_topics_from_medrxiv_article(
    Returns
    -------
-    topic : pathlib.Path or str
+    topic : str
        The subject area of the article.
    journal : str
        The journal the article was published in. Should be either
121 changes: 121 additions & 0 deletions src/bluesearch/database/topic_info.py
@@ -0,0 +1,121 @@
# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Implementation of the TopicInfo data structure."""
from __future__ import annotations

import copy
import datetime
import pathlib
from dataclasses import dataclass, field
from typing import Any

import bluesearch
from bluesearch.database.article import ArticleSource


@dataclass
class TopicInfo:
    """The topic information extracted from a journal article.

    For the spec see the following GitHub issue/comment:
    https://github.com/BlueBrain/Search/issues/518#issuecomment-985525160
    """

    source: ArticleSource
    path: str | pathlib.Path
    element_in_file: int | None = None
    article_topics: dict[str, list[str]] = field(init=False, default_factory=dict)
    journal_topics: dict[str, list[str]] = field(init=False, default_factory=dict)

    def __post_init__(self) -> None:
        """Run the post-initialization."""
        self.creation_date = datetime.datetime.now()
        self.path = pathlib.Path(self.path).resolve()

    @staticmethod
    def _add_topics(
        mapping: dict[str, list[str]], kind: str, topics: list[str]
    ) -> None:
        """Add topics to a mapping of topic collections.

        Parameters
        ----------
        mapping
            A mapping of the form kind -> list-of-topics that shall be
            updated in-place. For example ``{"MeSH": ["topic 1", "topic 2"]}``.
        kind
            The topic kind. Corresponds to a key in ``mapping``.
        topics
            The topics to add. Corresponds to a value in ``mapping``.
        """
        updated_topics = mapping.get(kind, []) + topics
        mapping[kind] = sorted(set(updated_topics))

    def add_article_topics(self, kind: str, topics: list[str]) -> None:
        """Add article topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self._add_topics(self.article_topics, kind, topics)

    def add_journal_topics(self, kind: str, topics: list[str]) -> None:
        """Add journal topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self._add_topics(self.journal_topics, kind, topics)

    def json(self) -> dict:
        """Convert the contents of this class to a structured dictionary.

        Apart from the source, path, and topic entries, a "metadata"
        top-level key will be added containing a dictionary with the
        entries "created-date" and "bbs-version".

        Returns
        -------
        dict
            The structured dictionary with all topic information.
        """
        metadata: dict[str, Any] = {
            "created-date": self.creation_date.strftime("%Y-%m-%d %H:%M:%S"),
            "bbs-version": bluesearch.__version__,
        }
        if self.element_in_file is not None:
            metadata["element_in_file"] = self.element_in_file

        json = {
            "source": self.source.value,
            "path": str(self.path),
            "topics": {
                "article": copy.deepcopy(self.article_topics),
                "journal": copy.deepcopy(self.journal_topics),
            },
            "metadata": metadata,
        }

        return json
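A short usage sketch of the new class; the file path below is hypothetical, and any path works since it is only resolved, never opened:

from bluesearch.database.article import ArticleSource
from bluesearch.database.topic_info import TopicInfo

info = TopicInfo(source=ArticleSource.PMC, path="papers/article.xml")

# Repeated calls merge, deduplicate, and sort the per-kind topic lists.
info.add_article_topics("MeSH", ["Neurons", "Brain"])
info.add_article_topics("MeSH", ["Brain"])
info.add_journal_topics("MAG", ["Neuroscience"])

print(info.json())
# {'source': 'pmc',
#  'path': '/.../papers/article.xml',   # resolved to an absolute path
#  'topics': {'article': {'MeSH': ['Brain', 'Neurons']},
#             'journal': {'MAG': ['Neuroscience']}},
#  'metadata': {'created-date': '...', 'bbs-version': '...'}}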
27 changes: 15 additions & 12 deletions src/bluesearch/entrypoint/database/download.py
@@ -23,24 +23,26 @@
from itertools import chain
from pathlib import Path

+from bluesearch.database.article import ArticleSource

logger = logging.getLogger(__name__)

# Data conventions and formats are different prior to these dates. We
# download only if the starting date is equal to or more recent than the
# respective threshold.
MIN_DATE = {
    # https://arxiv.org/help/arxiv_identifier#old
-    "arxiv": datetime(2007, 4, 1),
+    ArticleSource.ARXIV: datetime(2007, 4, 1),
    # https://www.biorxiv.org/tdm + looked into Current Content folder on GPFS
-    "biorxiv": datetime(2018, 12, 1),
+    ArticleSource.BIORXIV: datetime(2018, 12, 1),
    # https://www.medrxiv.org/tdm + looked into Current Content folder on GPFS
-    "medrxiv": datetime(2020, 10, 1),
+    ArticleSource.MEDRXIV: datetime(2020, 10, 1),
    # This should change every year in December:
    # see https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/
-    "pmc": datetime(2021, 12, 1),
+    ArticleSource.PMC: datetime(2021, 12, 1),
    # This should change every year in December:
    # see https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
-    "pubmed": datetime(2021, 12, 1),
+    ArticleSource.PUBMED: datetime(2021, 12, 1),
}


@@ -88,7 +90,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    parser.add_argument(
        "source",
        type=str,
-        choices=("arxiv", "biorxiv", "medrxiv", "pmc", "pubmed"),
+        choices=[member.value for member in ArticleSource],
        help="Source of the download.",
    )
    parser.add_argument(
@@ -129,16 +131,17 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> int:
        get_s3_urls,
    )

-    if from_month < MIN_DATE[source]:
+    article_source = ArticleSource(source)
+    if from_month < MIN_DATE[article_source]:
        logger.error(
-            f"The papers from before {MIN_DATE[source].strftime('%B %Y')} "
+            f"The papers from before {MIN_DATE[article_source].strftime('%B %Y')} "
            "follow a different format and can't be downloaded. "
            "Please contact the developers if you need them. "
            "To proceed please re-run the command with a different starting month."
        )
        return 1

if source == "pmc":
if article_source == ArticleSource.PMC:
url_dict = {}
for component in {"author_manuscript", "oa_comm", "oa_noncomm"}:
url_dict[component] = generate_pmc_urls(component, from_month)
@@ -158,7 +161,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> int:
            component_dir.mkdir(exist_ok=True, parents=True)
            download_articles(url_list, component_dir)
        return 0
elif source == "pubmed":
elif article_source == ArticleSource.PUBMED:
url_list = get_pubmed_urls(from_month)
if dry_run:
print("URL requests from:")
@@ -169,7 +172,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> int:
        output_dir.mkdir(exist_ok=True, parents=True)
        download_articles(url_list, output_dir)
        return 0
-    elif source in {"biorxiv", "medrxiv"}:
+    elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}:

        key_id = getpass.getpass("aws_access_key_id: ")
        secret_access_key = getpass.getpass("aws_secret_access_key: ")
@@ -192,7 +195,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> int:
        logger.info(f"Start downloading {source} papers.")
        download_s3_articles(bucket, url_dict, output_dir)
        return 0
elif source == "arxiv":
elif article_source == ArticleSource.ARXIV:
logger.info("Loading libraries")
from google.cloud.storage import Client

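A sketch of what the refactoring buys. The argparse choices are now derived from the enum rather than hardcoded (note that the comprehension, unlike the old tuple, also admits "unknown"), and the MIN_DATE thresholds are keyed by validated enum members; the MIN_DATE import path is assumed from the file location shown above:

from datetime import datetime

from bluesearch.database.article import ArticleSource
from bluesearch.entrypoint.database.download import MIN_DATE

# Derived CLI choices; one entry more than the old hardcoded tuple,
# because the enum also has an UNKNOWN member.
print([member.value for member in ArticleSource])
# ['arxiv', 'biorxiv', 'medrxiv', 'pmc', 'pubmed', 'unknown']

# The raw CLI string is converted once, then used for every lookup.
source = ArticleSource("arxiv")
from_month = datetime(2010, 1, 1)
print(from_month >= MIN_DATE[source])  # True: at or after April 2007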
