Skip to content
This repository has been archived by the owner on Jan 29, 2024. It is now read-only.

Add the TopicInfo data structure #542

Merged
merged 26 commits into from
Jan 19, 2022
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
f42260f
Add the ArticleSource enum
Jan 10, 2022
7da9448
Add the TopicInfo class
Jan 10, 2022
f3d548d
Use TopicInfo in the topic_extract module
Jan 10, 2022
93beecb
Add tests for TopicInfo
Jan 10, 2022
ed2d403
Merge branch 'master' into issues/539-topic-info-data-structure
Jan 10, 2022
0f2cb62
Fix failing test
Jan 10, 2022
e01b12a
Update topic_extract
Jan 10, 2022
b2712e3
Update whatsnew
Jan 10, 2022
37772aa
Turn the value of ArticleSource.UNKNOWN to a string
Jan 10, 2022
87ec2e2
Use ArticleSource values in topic extract CLI choices
Jan 10, 2022
9f5e831
Update API docs
Jan 10, 2022
458d8f3
Fix formatting
Jan 10, 2022
cba25d0
Streamline iteration over ArticleSource enum
Stannislav Jan 17, 2022
b1df1ae
Improve the handling of the source argument
Jan 17, 2022
827c79d
Remove metadata parameter from TopicInfo
Jan 17, 2022
5145e04
Resolve path in TopicInfo
Jan 17, 2022
fb0d21f
Use ArticleSource in the download command
Jan 17, 2022
0ec90d4
Make TopicInfo.add_topic private
Jan 17, 2022
c9fc877
Mention the "metadata" key in TopicInfo.json() docstring
Jan 17, 2022
2307565
Fix formatting
Jan 17, 2022
5c4d598
Merge branch 'master' into issues/539-topic-info-data-structure
Jan 17, 2022
019cd3a
Merge branch 'master' into issues/539-topic-info-data-structure
Jan 18, 2022
100c4b5
Use TopicInfo in arXiv topic extraction
Jan 18, 2022
1cce0b8
Compare enum members by identity
Jan 18, 2022
de2d831
Use TopicInfo in {bio,med}Rxiv topic extraction
Jan 18, 2022
b3cce77
Remove unused imports
Jan 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/api/bluesearch.database.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Submodules
bluesearch.database.mining_cache
bluesearch.database.pdf
bluesearch.database.topic
bluesearch.database.topic_info

Module contents
---------------
Expand Down
7 changes: 7 additions & 0 deletions docs/source/api/bluesearch.database.topic_info.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
bluesearch.database.topic\_info module
======================================

.. automodule:: bluesearch.database.topic_info
:members:
:undoc-members:
:show-inheritance:
2 changes: 2 additions & 0 deletions docs/source/whatsnew.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Legend

Latest
======
- |Add| the :code:`bluesearch.database.topic_info.TopicInfo` class
- |Add| the :code:`bluesearch.database.article.ArticleSource` enum class
- |Add| extraction of journal and article topics for :code:`pubmed` papers
through CLI command :code:`bbs_database topic-extract pubmed`.
- |Add| extraction of journal topics for :code:`pmc` papers through CLI command
Expand Down
12 changes: 12 additions & 0 deletions src/bluesearch/database/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"""Abstraction of scientific article data and related tools."""
from __future__ import annotations

import enum
import html
import re
import string
Expand All @@ -33,6 +34,17 @@
from bluesearch.database.identifiers import generate_uid


class ArticleSource(enum.Enum):
    """The source of an article.

    Every member's value is a lowercase string, so a member can be
    obtained from that string by value lookup, for example
    ``ArticleSource("pmc") is ArticleSource.PMC``. Looking up a string
    that is not one of the values below raises ``ValueError``.
    """

    ARXIV = "arxiv"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"
    PMC = "pmc"
    PUBMED = "pubmed"
    # NOTE(review): presumably a placeholder for articles whose source is
    # not one of the above -- confirm against the callers.
    UNKNOWN = "unknown"


def get_arxiv_id(path: str | Path) -> str | None:
"""Compute arXiv ID, including version, from file path.

Expand Down
116 changes: 116 additions & 0 deletions src/bluesearch/database/topic_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Implementation of the TopicInfo data structure."""
from __future__ import annotations

import copy
import datetime
import pathlib
from dataclasses import dataclass, field
from typing import Any

import bluesearch
from bluesearch.database.article import ArticleSource


@dataclass
class TopicInfo:
    """Container for the topic information extracted from a journal article.

    For the spec see the following GitHub issue/comment:
    https://github.com/BlueBrain/Search/issues/518#issuecomment-985525160

    Parameters
    ----------
    source
        The source of the article. Its ``value`` is written to the
        ``"source"`` key of the output of :meth:`json`.
    path
        The path associated with the article. It is converted to a string
        for the ``"path"`` key of the output of :meth:`json`.
    element_in_file
        Optional integer. If it is not ``None`` it is written to the
        ``"element_in_file"`` key of the output metadata.
    metadata
        Additional key-value pairs to include in the output metadata.

    Attributes
    ----------
    article_topics
        Mapping of the form kind -> sorted list of unique article topics.
        Not an ``__init__`` parameter; starts out empty.
    journal_topics
        Mapping of the form kind -> sorted list of unique journal topics.
        Not an ``__init__`` parameter; starts out empty.
    creation_date
        The local date and time at which the instance was created.
    """

    source: ArticleSource
    path: str | pathlib.Path
    element_in_file: int | None = None
    article_topics: dict[str, list[str]] = field(default_factory=dict, init=False)
    journal_topics: dict[str, list[str]] = field(default_factory=dict, init=False)
    metadata: dict = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Remember the moment at which this instance was created."""
        self.creation_date = datetime.datetime.now()

    @staticmethod
    def add_topics(
        mapping: dict[str, list[str]],
        kind: str,
        topics: list[str],
    ) -> None:
        """Merge new topics into a kind -> list-of-topics mapping.

        After the call the entry for ``kind`` holds the previously stored
        topics together with the new ones, without duplicates and sorted.

        Parameters
        ----------
        mapping
            A mapping of the form kind -> list-of-topics that is updated
            in-place. For example ``{"MeSH": ["topic 1", "topic 2"]}``.
        kind
            The topic kind. This is the key in ``mapping`` to update; it
            is created if it does not exist yet.
        topics
            The topics to add to the list stored under ``kind``.
        """
        already_present = mapping.get(kind, [])
        unique_topics = set(already_present + topics)
        mapping[kind] = sorted(unique_topics)

    def add_article_topics(self, kind: str, topics: list[str]) -> None:
        """Add topics that describe the article itself.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self.add_topics(self.article_topics, kind, topics)

    def add_journal_topics(self, kind: str, topics: list[str]) -> None:
        """Add topics that describe the journal of the article.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A list of the topics to add.
        """
        self.add_topics(self.journal_topics, kind, topics)

    def json(self) -> dict:
        """Convert the contents of this class to a structured dictionary.

        Returns
        -------
        dict
            The structured dictionary with all topic information. It has
            the keys ``"source"``, ``"path"``, ``"topics"`` (with the
            sub-keys ``"article"`` and ``"journal"``) and ``"metadata"``.
            The topics and the user metadata are deep copies, so changing
            the returned dictionary does not affect this instance.
        """
        # Start from a copy of the user metadata. The generated entries
        # are written afterwards, so they win if a key is already present.
        meta: dict[str, Any] = {**copy.deepcopy(self.metadata)}
        meta["created-date"] = self.creation_date.strftime("%Y-%m-%d %H:%M:%S")
        meta["bbs-version"] = bluesearch.__version__
        if self.element_in_file is not None:
            meta["element_in_file"] = self.element_in_file

        return {
            "source": self.source.value,
            "path": str(self.path),
            "topics": {
                "article": copy.deepcopy(self.article_topics),
                "journal": copy.deepcopy(self.journal_topics),
            },
            "metadata": meta,
        }
74 changes: 25 additions & 49 deletions src/bluesearch/entrypoint/database/topic_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@
from __future__ import annotations

import argparse
import datetime
import logging
from pathlib import Path
from typing import Any

from bluesearch.database.article import ArticleSource

logger = logging.getLogger(__name__)


Expand All @@ -45,13 +46,7 @@ def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser.add_argument(
"source",
type=str,
choices=(
"arxiv",
"biorxiv",
"medrxiv",
"pmc",
"pubmed",
),
choices=[member.value for member in ArticleSource.__members__.values()],
jankrepl marked this conversation as resolved.
Show resolved Hide resolved
FrancescoCasalegno marked this conversation as resolved.
Show resolved Hide resolved
help="""
Format of the input.
If extracting topic of several articles, all articles must have the same format.
Expand Down Expand Up @@ -129,12 +124,12 @@ def run(
"""
from defusedxml import ElementTree

import bluesearch
from bluesearch.database.topic import (
extract_article_topics_for_pubmed_article,
extract_journal_topics_for_pubmed_article,
get_topics_for_pmc_article,
)
from bluesearch.database.topic_info import TopicInfo
from bluesearch.utils import JSONL, find_files

try:
Expand All @@ -151,57 +146,38 @@ def run(
print(*inputs, sep="\n")
return 0

try:
article_source = ArticleSource(source)
except ValueError:
logger.error("Unknown article source: %s", source)
return 1
FrancescoCasalegno marked this conversation as resolved.
Show resolved Hide resolved
all_results: list[dict[str, Any]] = []

if source == "pmc":
if article_source == ArticleSource.PMC:
for path in inputs:
logger.info(f"Processing {path}")
topic_info = TopicInfo(source=article_source, path=path.resolve())
journal_topics = get_topics_for_pmc_article(path)
all_results.append(
{
"source": "pmc",
"path": str(path.resolve()),
"topics": {
"journal": {
"MeSH": journal_topics,
},
},
"metadata": {
"created-date": datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"
),
"bbs-version": bluesearch.version.__version__,
},
}
)
elif source == "pubmed":
if journal_topics:
topic_info.add_journal_topics("MeSH", journal_topics)
all_results.append(topic_info.json())
elif article_source == ArticleSource.PUBMED:
for path in inputs:
logger.info(f"Processing {path}")
articles = ElementTree.parse(input_path)
for i, article in enumerate(articles.iter("PubmedArticle")):
topic_info = TopicInfo(
source=article_source,
path=path.resolve(),
element_in_file=i,
)
article_topics = extract_article_topics_for_pubmed_article(article)
journal_topics = extract_journal_topics_for_pubmed_article(article)
all_results.append(
{
"source": "pubmed",
"path": str(path.resolve()),
"topics": {
"journal": {
"MeSH": journal_topics,
},
"article": {
"MeSH": article_topics,
},
},
"metadata": {
"created-date": datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"
),
"bbs-version": bluesearch.version.__version__,
"element_in_file": i,
},
}
)
if article_topics:
topic_info.add_article_topics("MeSH", article_topics)
if journal_topics:
topic_info.add_journal_topics("MeSH", journal_topics)
all_results.append(topic_info.json())
else:
logger.error(f"The source type {source!r} is not implemented yet")
return 1
Expand Down
80 changes: 80 additions & 0 deletions tests/unit/database/test_topic_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import datetime
import pathlib

import pytest

import bluesearch
from bluesearch.database.article import ArticleSource
from bluesearch.database.topic_info import TopicInfo


class TestTopicInfo:
    """Unit tests for the ``TopicInfo`` data structure."""

    def test_instantiation(self):
        """The constructor arguments are stored on the instance."""
        source = ArticleSource.ARXIV
        path = pathlib.Path("/some/path.test")
        metadata = {"some key": "some value"}
        topic_info = TopicInfo(source, path, metadata=metadata)

        assert topic_info.source == source
        assert topic_info.path == path
        # Check the attribute on the instance. Comparing the local variable
        # with itself would always pass and verify nothing.
        assert topic_info.metadata == metadata

    @pytest.mark.parametrize(
        ("mapping", "kind", "topics", "result"),
        (
            # A new kind is created on demand
            ({}, "MeSH", ["topic 1"], {"MeSH": ["topic 1"]}),
            # New topics are merged with the existing ones and sorted
            (
                {"MeSH": ["topic 2"]},
                "MeSH",
                ["topic 1"],
                {"MeSH": ["topic 1", "topic 2"]},
            ),
            # Duplicates are dropped
            ({"MeSH": ["topic 1"]}, "MeSH", ["topic 1"], {"MeSH": ["topic 1"]}),
        ),
    )
    def test_add_topics(self, mapping, kind, topics, result):
        """The mapping is updated in-place with sorted, unique topics."""
        TopicInfo.add_topics(mapping, kind, topics)
        assert mapping == result

    def test_add_article_journal_topics(self):
        """Article and journal topics end up in their own mappings."""
        topic_info = TopicInfo(ArticleSource.UNKNOWN, "")
        topic_info.add_article_topics("MeSH", ["AT 1", "AT 2", "AT 3"])
        topic_info.add_journal_topics("MAP", ["JT 1", "JT 2"])

        assert topic_info.article_topics == {"MeSH": ["AT 1", "AT 2", "AT 3"]}
        assert topic_info.journal_topics == {"MAP": ["JT 1", "JT 2"]}

    def test_json(self):
        """The JSON output has the expected structure and metadata."""
        # The timestamps are formatted the same way as "created-date", so
        # they can be compared with it as strings.
        start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        topic_info = TopicInfo(
            source=ArticleSource.PUBMED,
            path=pathlib.Path("/some/path.test"),
            element_in_file=5,
            metadata={"some key": "some value"},
        )
        topic_info.add_article_topics("MeSH", ["AT 1", "AT 2", "AT 3"])
        topic_info.add_journal_topics("MAP", ["JT 1", "JT 2"])

        end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        result = topic_info.json()
        # The metadata contains a timestamp, so it is checked separately
        metadata = result.pop("metadata")
        assert result == {
            "source": ArticleSource.PUBMED.value,
            "path": "/some/path.test",
            "topics": {
                "article": {"MeSH": ["AT 1", "AT 2", "AT 3"]},
                "journal": {"MAP": ["JT 1", "JT 2"]},
            },
        }
        assert start <= metadata["created-date"] <= end
        assert metadata["bbs-version"] == bluesearch.__version__
        assert metadata["some key"] == "some value"
        assert metadata["element_in_file"] == 5

    def test_element_in_file(self):
        """The "element_in_file" key is only present if a value was given."""
        result = TopicInfo(ArticleSource.UNKNOWN, "").json()
        assert "element_in_file" not in result["metadata"]

        result = TopicInfo(ArticleSource.UNKNOWN, "", element_in_file=5).json()
        assert result["metadata"].get("element_in_file") == 5
3 changes: 1 addition & 2 deletions tests/unit/entrypoint/database/test_topic_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def test_wrong_source(test_data_path, caplog, tmp_path):
dry_run=False,
)
assert exit_code == 1
assert "The source type" in caplog.text
assert "Unknown article source" in caplog.text


def test_dry_run(test_data_path, capsys, tmp_path):
Expand Down Expand Up @@ -132,7 +132,6 @@ def test_pmc_source(test_data_path, capsys, monkeypatch, tmp_path):
assert result["path"] == str(pmc_path)
assert isinstance(result["topics"], dict)
topics = result["topics"]
assert "article" not in topics
assert "journal" in topics
assert isinstance(topics["journal"], dict)
assert topics["journal"]["MeSH"] == meshes
Expand Down