Skip to content
This repository has been archived by the owner on Jan 29, 2024. It is now read-only.

Add the TopicInfo data structure #542

Merged
merged 26 commits into from
Jan 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
f42260f
Add the ArticleSource enum
Jan 10, 2022
7da9448
Add the TopicInfo class
Jan 10, 2022
f3d548d
Use TopicInfo in the topic_extract module
Jan 10, 2022
93beecb
Add tests for TopicInfo
Jan 10, 2022
ed2d403
Merge branch 'master' into issues/539-topic-info-data-structure
Jan 10, 2022
0f2cb62
Fix failing test
Jan 10, 2022
e01b12a
Update topic_extract
Jan 10, 2022
b2712e3
Update whatsnew
Jan 10, 2022
37772aa
Turn the value of ArticleSource.UNKNOWN to a string
Jan 10, 2022
87ec2e2
Use ArticleSource values in topic extract CLI choices
Jan 10, 2022
9f5e831
Update API docs
Jan 10, 2022
458d8f3
Fix formatting
Jan 10, 2022
cba25d0
Streamline iteration over ArticleSource enum
Stannislav Jan 17, 2022
b1df1ae
Improve the handling of the source argument
Jan 17, 2022
827c79d
Remove metadata parameter from TopicInfo
Jan 17, 2022
5145e04
Resolve path in TopicInfo
Jan 17, 2022
fb0d21f
Use ArticleSource in the download command
Jan 17, 2022
0ec90d4
Make TopicInfo.add_topic private
Jan 17, 2022
c9fc877
Mention the "metadata" key in TopicInfo.json() docstring
Jan 17, 2022
2307565
Fix formatting
Jan 17, 2022
5c4d598
Merge branch 'master' into issues/539-topic-info-data-structure
Jan 17, 2022
019cd3a
Merge branch 'master' into issues/539-topic-info-data-structure
Jan 18, 2022
100c4b5
Use TopicInfo in arXiv topic extraction
Jan 18, 2022
1cce0b8
Compare enum members by identity
Jan 18, 2022
de2d831
Use TopicInfo in {bio,med}Rxiv topic extraction
Jan 18, 2022
b3cce77
Remove unused imports
Jan 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/api/bluesearch.database.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Submodules
bluesearch.database.mining_cache
bluesearch.database.pdf
bluesearch.database.topic
bluesearch.database.topic_info

Module contents
---------------
Expand Down
7 changes: 7 additions & 0 deletions docs/source/api/bluesearch.database.topic_info.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
bluesearch.database.topic\_info module
======================================

.. automodule:: bluesearch.database.topic_info
:members:
:undoc-members:
:show-inheritance:
2 changes: 2 additions & 0 deletions docs/source/whatsnew.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Legend

Latest
======
- |Add| the :code:`bluesearch.database.topic_info.TopicInfo` class
- |Add| the :code:`bluesearch.database.article.ArticleSource` enum class
- |Add| extraction of journal and article topics for :code:`arxiv` papers
through CLI command :code:`bbs_database topic-extract arxiv`.
- |Add| extraction of journal and article topics for :code:`pubmed` papers
Expand Down
12 changes: 12 additions & 0 deletions src/bluesearch/database/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"""Abstraction of scientific article data and related tools."""
from __future__ import annotations

import enum
import html
import re
import string
Expand All @@ -33,6 +34,17 @@
from bluesearch.database.identifiers import generate_uid


class ArticleSource(enum.Enum):
    """The source of an article.

    Each member's value is the lowercase string identifier of the source.
    These values are used as CLI choices (e.g. the ``download`` command's
    ``source`` argument) and in serialized output (``TopicInfo.json()``).
    """

    ARXIV = "arxiv"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"
    PMC = "pmc"
    PUBMED = "pubmed"
    UNKNOWN = "unknown"


def get_arxiv_id(path: str | Path, with_prefix: bool = True) -> str:
"""Compute arXiv ID, including version, from file path.

Expand Down
2 changes: 1 addition & 1 deletion src/bluesearch/database/topic.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ def extract_article_topics_from_medrxiv_article(

Returns
-------
topic : pathlib.Path or str
topic : str
The subject area of the article.
journal : str
The journal the article was published in. Should be either
Expand Down
121 changes: 121 additions & 0 deletions src/bluesearch/database/topic_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Implementation of the TopicInfo data structure."""
from __future__ import annotations

import copy
import datetime
import pathlib
from dataclasses import dataclass, field
from typing import Any

import bluesearch
from bluesearch.database.article import ArticleSource


@dataclass
class TopicInfo:
    """Topic information extracted from a single journal article.

    For the spec see the following GitHub issue/comment:
    https://github.com/BlueBrain/Search/issues/518#issuecomment-985525160
    """

    source: ArticleSource
    path: str | pathlib.Path
    element_in_file: int | None = None
    article_topics: dict[str, list[str]] = field(init=False, default_factory=dict)
    journal_topics: dict[str, list[str]] = field(init=False, default_factory=dict)

    def __post_init__(self) -> None:
        """Run the post-initialization."""
        # Stamp the instance with its creation time; reported by ``json()``.
        self.creation_date = datetime.datetime.now()
        # Always store an absolute, resolved path whatever form was given.
        self.path = pathlib.Path(self.path).resolve()

    @staticmethod
    def _add_topics(
        mapping: dict[str, list[str]], kind: str, topics: list[str]
    ) -> None:
        """Merge new topics into a mapping of topic collections, in place.

        Parameters
        ----------
        mapping
            A mapping of the form kind -> list-of-topics that shall be
            updated in-place. For example ``{"MeSH": ["topic 1", "topic 2"]}``.
        kind
            The topic kind. Corresponds to a key in ``mapping``.
        topics
            The topics to add. Corresponds to a value in ``mapping``.
        """
        # Deduplicate against what is already stored and keep a sorted order.
        merged = set(mapping.get(kind, []))
        merged.update(topics)
        mapping[kind] = sorted(merged)

    def add_article_topics(self, kind: str, topics: list[str]) -> None:
        """Add article topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self._add_topics(self.article_topics, kind, topics)

    def add_journal_topics(self, kind: str, topics: list[str]) -> None:
        """Add journal topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self._add_topics(self.journal_topics, kind, topics)

    def json(self) -> dict:
        """Convert the contents of this class to a structured dictionary.

        Apart from the source, path and topic entries a "metadata" top-level
        key will be added containing a dictionary with entries "created-date"
        and "bbs-version".

        Returns
        -------
        dict
            The structured dictionary with all topic information.
        """
        metadata: dict[str, Any] = {
            "created-date": self.creation_date.strftime("%Y-%m-%d %H:%M:%S"),
            "bbs-version": bluesearch.__version__,
        }
        # Only include the element position when one was actually provided.
        if self.element_in_file is not None:
            metadata["element_in_file"] = self.element_in_file

        # Deep-copy the topic mappings so that callers cannot mutate this
        # instance's state through the returned dictionary.
        return {
            "source": self.source.value,
            "path": str(self.path),
            "topics": {
                "article": copy.deepcopy(self.article_topics),
                "journal": copy.deepcopy(self.journal_topics),
            },
            "metadata": metadata,
        }
27 changes: 15 additions & 12 deletions src/bluesearch/entrypoint/database/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,26 @@
from itertools import chain
from pathlib import Path

from bluesearch.database.article import ArticleSource

logger = logging.getLogger(__name__)

# Data conventions and formats are different prior to these dates. We
# download only if the starting date is more recent or equal to the
# respective threshold.
MIN_DATE = {
# https://arxiv.org/help/arxiv_identifier#old
"arxiv": datetime(2007, 4, 1),
ArticleSource.ARXIV: datetime(2007, 4, 1),
# https://www.biorxiv.org/tdm + looked into Current Content folder on GPFS
"biorxiv": datetime(2018, 12, 1),
ArticleSource.BIORXIV: datetime(2018, 12, 1),
# https://www.medrxiv.org/tdm + looked into Current Content folder on GPFS
"medrxiv": datetime(2020, 10, 1),
ArticleSource.MEDRXIV: datetime(2020, 10, 1),
# This should change every year in December:
# see https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/
"pmc": datetime(2021, 12, 1),
ArticleSource.PMC: datetime(2021, 12, 1),
# This should change every year in December:
# see https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/
"pubmed": datetime(2021, 12, 1),
ArticleSource.PUBMED: datetime(2021, 12, 1),
}


Expand Down Expand Up @@ -88,7 +90,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser.add_argument(
"source",
type=str,
choices=("arxiv", "biorxiv", "medrxiv", "pmc", "pubmed"),
choices=[member.value for member in ArticleSource],
help="Source of the download.",
)
parser.add_argument(
Expand Down Expand Up @@ -129,16 +131,17 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i
get_s3_urls,
)

if from_month < MIN_DATE[source]:
article_source = ArticleSource(source)
if from_month < MIN_DATE[article_source]:
logger.error(
f"The papers from before {MIN_DATE[source].strftime('%B %Y')} "
f"The papers from before {MIN_DATE[article_source].strftime('%B %Y')} "
"follow a different format and can't be downloaded. "
"Please contact the developers if you need them. "
"To proceed please re-run the command with a different starting month."
)
return 1

if source == "pmc":
if article_source == ArticleSource.PMC:
url_dict = {}
for component in {"author_manuscript", "oa_comm", "oa_noncomm"}:
url_dict[component] = generate_pmc_urls(component, from_month)
Expand All @@ -158,7 +161,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i
component_dir.mkdir(exist_ok=True, parents=True)
download_articles(url_list, component_dir)
return 0
elif source == "pubmed":
elif article_source == ArticleSource.PUBMED:
url_list = get_pubmed_urls(from_month)
if dry_run:
print("URL requests from:")
Expand All @@ -169,7 +172,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i
output_dir.mkdir(exist_ok=True, parents=True)
download_articles(url_list, output_dir)
return 0
elif source in {"biorxiv", "medrxiv"}:
elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}:

key_id = getpass.getpass("aws_access_key_id: ")
secret_access_key = getpass.getpass("aws_secret_access_key: ")
Expand All @@ -192,7 +195,7 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i
logger.info(f"Start downloading {source} papers.")
download_s3_articles(bucket, url_dict, output_dir)
return 0
elif source == "arxiv":
elif article_source == ArticleSource.ARXIV:
logger.info("Loading libraries")
from google.cloud.storage import Client

Expand Down
Loading